Fixed some GGUFv1 loading bugs, long overdue cleanup for compiling, integrated TTS

tts is functional (+6 squashed commits)

Squashed commit:

[22396311] wip tts

[3a883027] tts not yet working

[0dcfab0e] fix silly bug

[a378d9ef] some long overdue cleanup

[fc5a6fb5] Wip tts

[39f50497] wip TTS integration
Concedo committed on 2025-01-12 16:33:02 +08:00
parent 12cdcf0abe
commit b3de1598e7
17 changed files with 1175 additions and 271 deletions


@@ -495,7 +495,9 @@ add_library(common2
 examples/llava/clip.h
 src/unicode.h
 src/unicode.cpp
-src/unicode-data.cpp)
+src/unicode-data.cpp
+otherarch/utils.cpp
+otherarch/utils.h)
 target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
@@ -515,11 +517,18 @@ target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
+add_library(tts_adapter
+otherarch/tts_adapter.cpp)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common)
+target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
+target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(gpttype_adapter
 gpttype_adapter.cpp)
 target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
-target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)

 if (LLAMA_CUBLAS)
@@ -530,8 +539,16 @@ if (LLAMA_CUBLAS)
 set_target_properties(${TARGET} PROPERTIES PREFIX "")
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_custom_command(
+    TARGET koboldcpp_cublas POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+    $<TARGET_FILE:koboldcpp_cublas>   # The generated DLL
+    ${CMAKE_SOURCE_DIR}/              # Destination directory
+    COMMENT "Copying DLL to parent directory"
+)
 endif()

 if (LLAMA_HIPBLAS)
@@ -542,7 +559,15 @@ if (LLAMA_HIPBLAS)
 set_target_properties(${TARGET} PROPERTIES PREFIX "")
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
+target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_custom_command(
+    TARGET koboldcpp_hipblas POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+    $<TARGET_FILE:koboldcpp_hipblas>  # The generated DLL
+    ${CMAKE_SOURCE_DIR}/              # Destination directory
+    COMMENT "Copying DLL to parent directory"
+)
 endif()


@@ -4,7 +4,7 @@
 .PHONY: finishedmsg

 default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg
-tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
+tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip ttsmain whispermain sdmain gguf-split

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -90,10 +90,10 @@ endif
 CUBLASLD_FLAGS =
 CUBLAS_OBJS =

-OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o
+OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o
-OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o
+OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o
-OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o
+OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o
-OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o
+OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o

 # OS specific
 ifeq ($(UNAME_S),Linux)
@@ -539,6 +539,8 @@ ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+kcpputils.o: otherarch/utils.cpp otherarch/utils.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 #these have special gpu defines
 ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
@@ -639,8 +641,12 @@ whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp
 whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
+
+#tts objects
+tts_default.o: otherarch/tts_adapter.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)
@@ -680,11 +686,11 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 	$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/ggml-vulkan/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp

 #generated libraries
-koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(DEFAULT_BUILD)

 ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
+koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
 	$(FAILSAFE_BUILD)
 else
 koboldcpp_failsafe:
@@ -692,7 +698,7 @@ koboldcpp_failsafe:
 endif

 ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
 	$(NOAVX2_BUILD)
 else
 koboldcpp_noavx2:
@@ -700,10 +706,10 @@ koboldcpp_noavx2:
 endif

 ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CLBLAST_BUILD)
 ifdef NOAVX2_BUILD
-koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
+koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
 	$(CLBLAST_BUILD)
 else
 koboldcpp_clblast_noavx2:
@@ -717,7 +723,7 @@ koboldcpp_clblast_noavx2:
 endif

 ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
+koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
 	$(CUBLAS_BUILD)
 else
 koboldcpp_cublas:
@@ -725,7 +731,7 @@ koboldcpp_cublas:
 endif

 ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
+koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
 	$(HIPBLAS_BUILD)
 else
 koboldcpp_hipblas:
@@ -733,10 +739,10 @@ koboldcpp_hipblas:
 endif

 ifdef VULKAN_BUILD
-koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
+koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
 	$(VULKAN_BUILD)
 ifdef NOAVX2_BUILD
-koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
 	$(VULKAN_BUILD)
 else
 koboldcpp_vulkan_noavx2:


@@ -238,6 +238,15 @@ extern "C"
         return whispertype_generate(inputs);
     }

+    bool tts_load_model(const tts_load_model_inputs inputs)
+    {
+        return ttstype_load_model(inputs);
+    }
+    tts_generation_outputs tts_generate(const tts_generation_inputs inputs)
+    {
+        return ttstype_generate(inputs);
+    }
+
     const char * new_token(int idx) {
         if (generated_tokens.size() <= idx || idx < 0) return nullptr;


@@ -139,6 +139,7 @@ struct last_logprobs_outputs {
     int count = 0;
     logprob_item * logprob_items = nullptr;
 };
+
 struct sd_load_model_inputs
 {
     const char * model_filename = nullptr;
@@ -178,6 +179,7 @@ struct sd_generation_outputs
     int status = -1;
     const char * data = "";
 };
+
 struct whisper_load_model_inputs
 {
     const char * model_filename = nullptr;
@@ -201,6 +203,30 @@ struct whisper_generation_outputs
     const char * text = "";
 };
+
+struct tts_load_model_inputs
+{
+    const char * ttc_model_filename = nullptr;
+    const char * cts_model_filename = nullptr;
+    const char * executable_path = nullptr;
+    const int clblast_info = 0;
+    const int cublas_info = 0;
+    const char * vulkan_info = nullptr;
+    const int gpulayers = 0;
+    const int debugmode = 0;
+};
+struct tts_generation_inputs
+{
+    const char * prompt = nullptr;
+    const int speaker_seed = 0;
+    const int audio_seed = 0;
+    const bool quiet = false;
+};
+struct tts_generation_outputs
+{
+    int status = -1;
+    const char * data = "";
+};
 extern std::string executable_path;
 extern std::string lora_filename;
 extern std::string lora_base;


@@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
         }

         if (ok && gr.read(n_kv_32)) {
-            n_kv_32 = n_kv_32;
+            n_kv = n_kv_32;
         } else {
             ok = false;
         }

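The hunk above is one of the GGUFv1 loading fixes named in the commit title: the legacy 32-bit KV count was read into n_kv_32 and then assigned back to itself, so the real n_kv never received the value. A minimal Python sketch of the same header handling (layout per the GGUF format; illustrative only, not part of this commit):

import struct

def read_gguf_counts(path):
    # GGUF header: 4-byte magic "GGUF", uint32 version, then the counts.
    # v1 stored tensor_count and metadata_kv_count as uint32; v2+ use 64-bit,
    # so v1 values must be read narrow and widened, which is exactly what the
    # corrected line (n_kv = n_kv_32) does on the C++ side.
    with open(path, "rb") as f:
        if f.read(4) != b"GGUF":
            raise ValueError("not a GGUF file")
        (version,) = struct.unpack("<I", f.read(4))
        if version == 1:
            n_tensors, n_kv = struct.unpack("<II", f.read(8))
        else:
            n_tensors, n_kv = struct.unpack("<QQ", f.read(16))
    return version, n_tensors, n_kv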

@@ -21,12 +21,13 @@
 #include <cctype>
 #include <locale>
+#include "utils.h"
+
 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
 #include "llama_v3.cpp"
 #include "src/llama.cpp"
-#include "utils.cpp"
 #include "gptj_v1.cpp"
 #include "gptj_v2.cpp"
 #include "gptj_v3.cpp"
@@ -535,99 +536,6 @@ const char * kcpp_print_system_info(void) {
     return s.c_str();
 }

-struct kcpp_embd_batch { //duplcated from llava_embd_batch
-    std::vector<int32_t> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<int32_t> seq_id_0;
-    std::vector<int32_t *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) {
-        int32_t seq_id = 0;
-        pos.resize(n_tokens * (use_mrope?4:1));
-        std::fill(pos.begin(), pos.end(), 0);
-        n_seq_id.resize(n_tokens);
-        seq_ids.resize(n_tokens + 1);
-        logits.resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids[n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ embd,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        if(!use_mrope)
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.pos     [i] = npast + i;
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = false;
-            }
-        }
-        else
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = false;
-            }
-            for (int j = 0; j < batch.n_tokens * 3; j++) {
-                batch.pos[j] = npast + (j % batch.n_tokens);
-            }
-        }
-    }
-    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits) {
-        int32_t seq_id = 0;
-        int32_t n_tokens = tokens.size();
-        pos.resize(n_tokens * (use_mrope?4:1));
-        std::fill(pos.begin(), pos.end(), 0);
-        n_seq_id.resize(n_tokens);
-        seq_ids.resize(n_tokens + 1);
-        logits.resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids[n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ tokens.data(),
-            /*embd     =*/ nullptr,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        if(!use_mrope)
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.pos     [i] = npast + i;
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = (return_all_logits?true:false);
-            }
-        }
-        else
-        {
-            for (int i = 0; i < n_tokens; i++) {
-                batch.n_seq_id[i] = 1;
-                batch.seq_id  [i] = seq_id_0.data();
-                batch.logits  [i] = (return_all_logits?true:false);
-            }
-            for (int j = 0; j < batch.n_tokens * 3; j++) {
-                batch.pos[j] = npast + (j % batch.n_tokens);
-            }
-        }
-        batch.logits[n_tokens - 1] = true;
-    }
-};
-
 //loads a model for speculative decoding.
 static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers)
 {
@@ -664,7 +572,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
     draft_ctx_params.type_k = base_ctx_params.type_k;
     draft_ctx_params.type_v = base_ctx_params.type_v;

-    llama_model * draftmodel = llama_load_model_from_file(spec_model_filename.c_str(), draft_model_params);
+    llama_model * draftmodel = llama_model_load_from_file(spec_model_filename.c_str(), draft_model_params);
     draft_ctx = llama_new_context_with_model(draftmodel, draft_ctx_params);
     if(draft_ctx == NULL)
     {
@@ -2252,7 +2160,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         kvos.push_back(kvo);
         model_params.kv_overrides = kvos.data();
     }
-    llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);
+    llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
     if(overwriteRope)
     {


@@ -1,3 +1,5 @@
+#pragma once
+
 #ifndef LLAMA_H
 #define LLAMA_H


@@ -53,6 +53,7 @@ fullsdmodelpath = "" #if empty, it's not initialized
 mmprojpath = "" #if empty, it's not initialized
 password = "" #if empty, no auth key required
 fullwhispermodelpath = "" #if empty, it's not initialized
+ttsmodelpath = "" #if empty, not initialized
 maxctx = 4096
 maxhordectx = 4096
 maxhordelen = 400
@@ -281,6 +282,26 @@ class whisper_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
                 ("data", ctypes.c_char_p)]

+class tts_load_model_inputs(ctypes.Structure):
+    _fields_ = [("ttc_model_filename", ctypes.c_char_p),
+                ("cts_model_filename", ctypes.c_char_p),
+                ("executable_path", ctypes.c_char_p),
+                ("clblast_info", ctypes.c_int),
+                ("cublas_info", ctypes.c_int),
+                ("vulkan_info", ctypes.c_char_p),
+                ("gpulayers", ctypes.c_int),
+                ("debugmode", ctypes.c_int)]
+
+class tts_generation_inputs(ctypes.Structure):
+    _fields_ = [("prompt", ctypes.c_char_p),
+                ("speaker_seed", ctypes.c_int),
+                ("audio_seed", ctypes.c_int),
+                ("quiet", ctypes.c_bool)]
+
+class tts_generation_outputs(ctypes.Structure):
+    _fields_ = [("status", ctypes.c_int),
+                ("data", ctypes.c_char_p)]
+
 def getdirpath():
     return os.path.dirname(os.path.realpath(__file__))
 def getabspath():
@@ -440,6 +461,10 @@ def init_library():
     handle.whisper_load_model.restype = ctypes.c_bool
     handle.whisper_generate.argtypes = [whisper_generation_inputs]
     handle.whisper_generate.restype = whisper_generation_outputs
+    handle.tts_load_model.argtypes = [tts_load_model_inputs]
+    handle.tts_load_model.restype = ctypes.c_bool
+    handle.tts_generate.argtypes = [tts_generation_inputs]
+    handle.tts_generate.restype = tts_generation_outputs
     handle.last_logprobs.restype = last_logprobs_outputs
     handle.detokenize.argtypes = [token_count_outputs]
     handle.detokenize.restype = ctypes.c_char_p
@@ -577,9 +602,13 @@ def utfprint(str, importance = 2): #0 = only debugmode, 1 = except quiet, 2 = al
         maxlen = 32000
         if args.debugmode >= 1:
             maxlen = 64000
-        strlength = len(str)
-        if strlength > maxlen: #limit max output len
-            str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
+        try:
+            strlength = len(str)
+            if strlength > maxlen: #limit max output len
+                str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
+        except Exception:
+            pass
     try:
         print(str)
     except UnicodeEncodeError:
@@ -647,13 +676,14 @@ def read_gguf_metadata(file_path):
     except Exception:
         return None

-def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath):
+def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath):
     global modelfile_extracted_meta
     modelfile_extracted_meta = None
     sdfsize = 0
     whisperfsize = 0
     mmprojsize = 0
     draftmodelsize = 0
+    ttsmodelsize = 0
     if sdfilepath and os.path.exists(sdfilepath):
         sdfsize = os.path.getsize(sdfilepath)
     if whisperfilepath and os.path.exists(whisperfilepath):
@@ -662,12 +692,14 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
         mmprojsize = os.path.getsize(mmprojfilepath)
     if draftmodelpath and os.path.exists(draftmodelpath):
         draftmodelsize = os.path.getsize(draftmodelpath)
+    if ttsmodelpath and os.path.exists(ttsmodelpath):
+        ttsmodelsize = os.path.getsize(ttsmodelpath)
     if filepath and os.path.exists(filepath):
         try:
             fsize = os.path.getsize(filepath)
             if fsize>10000000: #dont bother with models < 10mb as they are probably bad
                 ggufmeta = read_gguf_metadata(filepath)
-                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null
+                modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize,ttsmodelsize] #extract done. note that meta may be null
         except Exception:
             modelfile_extracted_meta = None
@@ -699,6 +731,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
             mem -= 350*1024*1024
         if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax
             mem -= (modelfile_extracted_meta[5] * 1.5)
+        if modelfile_extracted_meta[6] > 1024*1024*10: #tts model tax
+            mem -= max(600*1024*1024, modelfile_extracted_meta[6] * 3)
         mem = 0 if mem < 0 else mem

         csmul = 1.0
@@ -730,6 +764,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
     FetchedCUdevices = []
     FetchedCUdeviceMem = []
     FetchedCUfreeMem = []
+    faileddetectvram = False
+
     AMDgpu = None
     try: # Get NVIDIA GPU names
         output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
@@ -737,6 +773,10 @@ def fetch_gpu_properties(testCL,testCU,testVK):
         FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
         FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
     except Exception:
+        FetchedCUdevices = []
+        FetchedCUdeviceMem = []
+        FetchedCUfreeMem = []
+        faileddetectvram = True
         pass
     if len(FetchedCUdevices)==0:
         try: # Get AMD ROCm GPU names
@@ -756,18 +796,30 @@ def fetch_gpu_properties(testCL,testCU,testVK):
             if getamdvram:
                 FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
         except Exception:
+            FetchedCUdevices = []
+            FetchedCUdeviceMem = []
+            FetchedCUfreeMem = []
+            faileddetectvram = True
             pass

     lowestcumem = 0
     lowestfreecumem = 0
-    for idx in range(0,4):
-        if(len(FetchedCUdevices)>idx):
-            CUDevicesNames[idx] = FetchedCUdevices[idx]
-            if len(FetchedCUdeviceMem)>idx:
-                dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
-                lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
-            if len(FetchedCUfreeMem)>idx:
-                dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
-                lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
+    try:
+        for idx in range(0,4):
+            if(len(FetchedCUdevices)>idx):
+                CUDevicesNames[idx] = FetchedCUdevices[idx]
+                if len(FetchedCUdeviceMem)>idx:
+                    dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
+                    lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
+                if len(FetchedCUfreeMem)>idx:
+                    dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
+                    lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
+    except Exception:
+        lowestcumem = 0
+        lowestfreecumem = 0
+        faileddetectvram = True
+
+    if faileddetectvram:
+        print("Unable to detect VRAM, please set layers manually.")

     MaxMemory[0] = max(lowestcumem,MaxMemory[0])
     MaxFreeMemory[0] = max(lowestfreecumem,MaxFreeMemory[0])
@@ -1264,6 +1316,34 @@ def whisper_generate(genparams):
         outstr = ret.data.decode("UTF-8","ignore")
     return outstr

+def tts_load_model(ttc_model_filename,cts_model_filename):
+    global args
+    inputs = tts_load_model_inputs()
+    inputs.debugmode = args.debugmode
+    inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
+    inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
+    inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
+    inputs.gpulayers = (999 if args.ttsgpu else 0)
+    inputs = set_backend_props(inputs)
+    ret = handle.tts_load_model(inputs)
+    return ret
+
+def tts_generate(genparams):
+    global args
+    is_quiet = True if (args.quiet or args.debugmode == -1) else False
+    prompt = genparams.get("input", "")
+    prompt = prompt.strip()
+    inputs = tts_generation_inputs()
+    inputs.prompt = prompt.encode("UTF-8")
+    inputs.speaker_seed = 0
+    inputs.audio_seed = 0
+    inputs.quiet = is_quiet
+    ret = handle.tts_generate(inputs)
+    outstr = ""
+    if ret.status==1:
+        outstr = ret.data.decode("UTF-8","ignore")
+    return outstr
+
 def tokenize_ids(countprompt,tcaddspecial):
     rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
     countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
@@ -1738,10 +1818,11 @@ def LaunchWebbrowser(target_url, failedmsg):
     try:
         import webbrowser as wb
         if wb.open(target_url, autoraise=True):
            return
        raise RuntimeError("Cannot open default browser")
-    except Exception:
+    except Exception as e:
         try:
+            print(f"Browser failed to launch: {e}, attempting to use xdg-open...")
             import webbrowser as wb
             if wb.get('xdg-open').open(target_url, autoraise=True):
                 return
@@ -2102,7 +2183,7 @@ Enter Prompt:<br>
     def do_GET(self):
         global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
-        global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
+        global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
         self.path = self.path.rstrip('/')
         response_body = None
         content_type = 'application/json'
@@ -2160,7 +2241,8 @@ Enter Prompt:<br>
             has_password = (password!="")
             has_whisper = (fullwhispermodelpath!="")
             has_search = True if args.websearch else False
-            response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search}).encode())
+            has_tts = (ttsmodelpath!="")
+            response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts}).encode())

         elif self.path.endswith(('/api/extra/perf')):
             global last_req_time, start_time
@@ -2521,7 +2603,7 @@ Enter Prompt:<br>
         reqblocking = False
         muint = int(args.multiuser)
-        if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="")):
+        if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="")):
             muint = 2 # this prevents errors when using voice/img together with text
         multiuserlimit = ((muint-1) if muint > 1 else 6)
         #backwards compatibility for up to 7 concurrent requests, use default limit of 7 if multiuser set to 1
@@ -2546,6 +2628,7 @@ Enter Prompt:<br>
         is_imggen = False
         is_comfyui_imggen = False
         is_transcribe = False
+        is_tts = False

         if self.path.endswith('/request'):
             api_format = 1
@@ -2588,11 +2671,14 @@ Enter Prompt:<br>
         if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
             is_transcribe = True
+        if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech'):
+            is_tts = True

-        if is_imggen or is_transcribe or api_format > 0:
+        if is_imggen or is_transcribe or is_tts or api_format > 0:
             global last_req_time
             last_req_time = time.time()

-            if not is_imggen and not is_transcribe and api_format!=5:
+            if not is_imggen and not is_transcribe and not is_tts and api_format!=5:
                 if not self.secure_endpoint():
                     return
@@ -2680,6 +2766,21 @@ Enter Prompt:<br>
                     print("Transcribe: The response could not be sent, maybe connection was terminated?")
                     time.sleep(0.2) #short delay
                 return
+            elif is_tts:
+                try:
+                    gen = tts_generate(genparams)
+                    wav_data = b''
+                    if gen:
+                        wav_data = base64.b64decode(gen) # Decode the Base64 string into binary data
+                    self.send_response(200)
+                    self.send_header('content-length', str(len(wav_data))) # Set content length
+                    self.end_headers(content_type='audio/wav')
+                    self.wfile.write(wav_data) # Write the binary WAV data to the response
+                except Exception as ex:
+                    utfprint(ex,0)
+                    print("TTS: The response could not be sent, maybe connection was terminated?")
+                    time.sleep(0.2) #short delay
+                return
         finally:
             time.sleep(0.05)
@@ -2806,7 +2907,7 @@ def show_gui():
         if dlfile:
             args.model_param = dlfile
         load_config_cli(args.model_param)
-    if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel:
+    if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel:
         global exitcounter
         exitcounter = 999
         exit_with_error(2,"No ggml model or kcpps file was selected. Exiting.")
@@ -3008,6 +3109,9 @@ def show_gui():
     sd_quant_var = ctk.IntVar(value=0)

     whisper_model_var = ctk.StringVar()
+    tts_model_var = ctk.StringVar()
+    wavtokenizer_var = ctk.StringVar()
+    ttsgpu_var = ctk.IntVar(value=0)

     def tabbuttonaction(name):
         for t in tabcontent:
@@ -3158,7 +3262,8 @@ def show_gui():
         whisperfilepath = whisper_model_var.get()
         mmprojfilepath = mmproj_var.get()
         draftmodelpath = draftmodel_var.get()
-        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath)
+        ttsmodelpath = tts_model_var.get() if ttsgpu_var.get()==1 else ""
+        extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath)
         changed_gpulayers_estimate()
         pass
@@ -3575,8 +3680,14 @@ def show_gui():
     # audio tab
     audio_tab = tabcontent["Audio"]
-    makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded.")
+    makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
     whisper_model_var.trace("w", gui_changed_modelfile)
+    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
+    tts_model_var.trace("w", gui_changed_modelfile)
+    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
+    wavtokenizer_var.trace("w", gui_changed_modelfile)
+    makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
+    ttsgpu_var.trace("w", gui_changed_modelfile)
@@ -3625,7 +3736,7 @@ def show_gui():
     # launch
     def guilaunch():
-        if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1:
+        if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and nomodel.get()!=1:
             tmp = askopenfilename(title="Select ggml model .bin or .gguf file")
             model_var.set(tmp)
         nonlocal nextstate
@@ -3792,6 +3903,11 @@ def show_gui():
         if whisper_model_var.get() != "":
             args.whispermodel = whisper_model_var.get()
+        if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
+            args.ttsmodel = tts_model_var.get()
+            args.ttswavtokenizer = wavtokenizer_var.get()
+            args.ttsgpu = (ttsgpu_var.get()==1)

     def import_vars(dict):
         global importvars_in_progress
         importvars_in_progress = True
@@ -3952,6 +4068,10 @@ def show_gui():
         whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
+        tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
+        wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
+        ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)

         importvars_in_progress = False
         gui_changed_modelfile()
         if "istemplate" in dict and dict["istemplate"]:
@@ -4022,7 +4142,7 @@ def show_gui():
         kcpp_exporting_template = False
         export_vars()

-        if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel:
+        if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel:
             exitcounter = 999
             print("")
             time.sleep(0.5)
@@ -4566,7 +4686,7 @@ def analyze_gguf_model_wrapper(filename=""):
 def main(launch_args,start_server=True):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
-    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
+    global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
     args = launch_args

     if (args.version) and len(sys.argv) <= 2:
@@ -4629,7 +4749,7 @@ def main(launch_args,start_server=True):
     if not args.model_param:
         args.model_param = args.model

-    if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel):
+    if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel):
         #give them a chance to pick a file
         print("For command line arguments, please refer to --help")
         print("***")
@@ -4753,6 +4873,14 @@ def main(launch_args,start_server=True):
         dlfile = download_model_from_url(args.draftmodel,[".gguf"])
         if dlfile:
             args.draftmodel = dlfile
+    if args.ttsmodel and args.ttsmodel!="":
+        dlfile = download_model_from_url(args.ttsmodel,[".gguf"])
+        if dlfile:
+            args.ttsmodel = dlfile
+    if args.ttswavtokenizer and args.ttswavtokenizer!="":
+        dlfile = download_model_from_url(args.ttswavtokenizer,[".gguf"])
+        if dlfile:
+            args.ttswavtokenizer = dlfile

     # sanitize and replace the default vanity name. remember me....
     if args.model_param and args.model_param!="":
@@ -4830,7 +4958,7 @@ def main(launch_args,start_server=True):
             pass
     if args.gpulayers==-1:
         if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
-            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel)
+            extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "")
             layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
             print(f"Auto Recommended GPU Layers: {layeramt}")
             args.gpulayers = layeramt
@@ -4999,6 +5127,27 @@ def main(launch_args,start_server=True):
             exitcounter = 999
             exit_with_error(3,"Could not load whisper model: " + whispermodel)

+    #handle tts model
+    if args.ttsmodel and args.ttsmodel!="" and args.ttswavtokenizer and args.ttswavtokenizer!="":
+        if not os.path.exists(args.ttsmodel) or not os.path.exists(args.ttswavtokenizer):
+            if args.ignoremissing:
+                print("Ignoring missing TTS model files!")
+                args.ttsmodel = None
+                args.ttswavtokenizer = None
+            else:
+                exitcounter = 999
+                exit_with_error(2,f"Cannot find tts model files: {args.ttsmodel} or {args.ttswavtokenizer}")
+        else:
+            ttsmodelpath = args.ttsmodel
+            ttsmodelpath = os.path.abspath(ttsmodelpath)
+            wavtokpath = args.ttswavtokenizer
+            wavtokpath = os.path.abspath(wavtokpath)
+            loadok = tts_load_model(ttsmodelpath,wavtokpath)
+            print("Load TTS Model OK: " + str(loadok))
+            if not loadok:
+                exitcounter = 999
+                exit_with_error(3,"Could not load TTS model!")

     #load embedded lite
     try:
@@ -5296,7 +5445,12 @@ if __name__ == '__main__':
     sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true')

     whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
-    whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper bin model to enable Speech-To-Text transcription.", default="")
+    whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
+
+    ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
+    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
+    ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
+    ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')

     deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
     deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')

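With the handler above wired up, the new endpoint behaves like the other KoboldCpp routes: POST a JSON body and receive raw WAV bytes back. A hypothetical client call, assuming the default server address http://localhost:5001 (the "input" field is what tts_generate() reads, and the server already base64-decodes before responding, so the body is plain audio/wav):

import json, urllib.request

req = urllib.request.Request(
    "http://localhost:5001/api/extra/tts",  # /v1/audio/speech routes here too
    data=json.dumps({"input": "Hello there, this is a test."}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    wav_bytes = resp.read()  # raw WAV, served as content-type audio/wav
with open("out.wav", "wb") as f:
    f.write(wav_bytes)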

@@ -105,6 +105,9 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs);
 bool whispertype_load_model(const whisper_load_model_inputs inputs);
 whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs);

+bool ttstype_load_model(const tts_load_model_inputs inputs);
+tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs);
+
 void timer_start();
 double timer_check();
 void print_tok_vec(std::vector<int> &embd);


@@ -188,13 +188,8 @@
 #endif

 // TODO: support for clang
-#ifdef __GNUC__
-#    define GGML_V3_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
-#elif defined(_MSC_VER)
-#    define GGML_V3_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
-#else
 #    define GGML_V3_DEPRECATED(func, hint) func
-#endif

 #ifndef __GNUC__
 #    define GGML_V3_ATTRIBUTE_FORMAT(...)


@@ -436,19 +436,23 @@ struct llama_v2_file_loader {
     uint32_t magic = file.read_u32();
     uint32_t version = 0;

-    if (magic != 'ggml') {
+    uint32_t magic_ggjt = 0x67676a74u; // 'ggjt'
+    uint32_t magic_ggmf = 0x67676d66u; // 'ggmf'
+    uint32_t magic_ggml = 0x67676d6cu; // 'ggml'
+
+    if (magic != magic_ggml) {
         version = file.read_u32();
     }

-    if (magic == 'ggml' && version == 0) {
+    if (magic == magic_ggml && version == 0) {
         file_version = LLAMA_V2_FILE_VERSION_GGML;
-    } else if (magic == 'ggmf' && version == 1) {
+    } else if (magic == magic_ggmf && version == 1) {
         file_version = LLAMA_V2_FILE_VERSION_GGMF_V1;
-    } else if (magic == 'ggjt' && version == 1) {
+    } else if (magic == magic_ggjt && version == 1) {
         file_version = LLAMA_V2_FILE_VERSION_GGJT_V1;
-    } else if (magic == 'ggjt' && version == 2) {
+    } else if (magic == magic_ggjt && version == 2) {
         file_version = LLAMA_V2_FILE_VERSION_GGJT_V2;
-    } else if (magic == 'ggjt' && version == 3) {
+    } else if (magic == magic_ggjt && version == 3) {
         file_version = LLAMA_V2_FILE_VERSION_GGJT_V3;
     } else {
         throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
@@ -553,7 +557,8 @@ struct llama_v2_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32(LLAMA_V2_FILE_MAGIC); // magic
+        uint32_t magic_ggjt = 0x67676a74u; // 'ggjt'
+        file.write_u32(magic_ggjt); // magic
         file.write_u32(LLAMA_V2_FILE_VERSION); // version
     }
     void write_hparams(enum llama_v2_ftype new_ftype) {
@@ -2308,7 +2313,8 @@ int llama_v2_apply_lora_from_file_internal(struct llama_v2_context * ctx, const
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        uint32_t magic_ggla = 0x67676c61u; // 'ggla'
+        if (magic != magic_ggla) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2800,85 +2806,6 @@ size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * sr
     return nread;
 }

-bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    llama_v2_file file(path_session, "rb");
-
-    // sanity checks
-    {
-        const uint32_t magic = file.read_u32();
-        const uint32_t version = file.read_u32();
-
-        if (magic != LLAMA_V2_SESSION_MAGIC || version != LLAMA_V2_SESSION_VERSION) {
-            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-            return false;
-        }
-
-        llama_v2_hparams session_hparams;
-        file.read_raw(&session_hparams, sizeof(llama_v2_hparams));
-
-        if (session_hparams != ctx->model.hparams) {
-            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-            return false;
-        }
-    }
-
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
-
-        if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
-
-        file.read_raw(tokens_out, sizeof(llama_v2_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
-
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_v2_get_state_size(ctx);
-
-        if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
-        }
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
-
-        llama_v2_set_state_data(ctx, state_data.data());
-    }
-
-    return true;
-}
-
-bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count) {
-    llama_v2_file file(path_session, "wb");
-
-    file.write_u32(LLAMA_V2_SESSION_MAGIC);
-    file.write_u32(LLAMA_V2_SESSION_VERSION);
-
-    file.write_raw(&ctx->model.hparams, sizeof(llama_v2_hparams));
-
-    // save the prompt
-    file.write_u32((uint32_t) n_token_count);
-    file.write_raw(tokens, sizeof(llama_v2_token) * n_token_count);
-
-    // save the context state
-    {
-        const size_t n_state_size_max = llama_v2_get_state_size(ctx);
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_v2_copy_state_data(ctx, state_data.data());
-
-        file.write_raw(state_data.data(), n_state_size_cur);
-    }
-
-    return true;
-}
-
 int llama_v2_eval(
         struct llama_v2_context * ctx,
         const llama_v2_token * tokens,


@@ -140,10 +140,6 @@ extern "C" {
    // Returns the number of bytes read
    LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src);
-   // Save/load session file
-   LLAMA_V2_API bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-   LLAMA_V2_API bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count);
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
@@ -167,7 +163,7 @@ extern "C" {
        int n_max_tokens,
        bool add_bos);
std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos);
LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx);


@@ -126,7 +126,7 @@ struct rwkv_v2_model {
// Finds model parameter by key and sets it into dest.
// If the parameter was not found, returns false.
-bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, char * key, struct ggml_v2_tensor ** dest) {
+bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, const char * key, struct ggml_v2_tensor ** dest) {
    struct ggml_v2_tensor * parameter = (*parameters)[key];
    RWKV_V2_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key);
    *dest = parameter;
@@ -135,7 +135,7 @@ bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor
// Finds block parameter by block index and key and sets it into dest.
// If the parameter was not found, returns false.
-bool rwkv_v2_set_block_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, int32_t block_index, char * key, struct ggml_v2_tensor ** dest) {
+bool rwkv_v2_set_block_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, int32_t block_index, const char * key, struct ggml_v2_tensor ** dest) {
    char full_key[128];
    sprintf(full_key, "blocks.%d.%s", block_index, key);
    return rwkv_v2_set_parameter(parameters, full_key, dest);


@@ -112,28 +112,6 @@ static sd_ctx_t * sd_ctx = nullptr;
static int sddebugmode = 0;
static std::string recent_data = "";
-std::string base64_encode(const unsigned char* data, unsigned int data_length) {
-    const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-    std::string encoded;
-    encoded.reserve(((data_length + 2) / 3) * 4);
-    for (unsigned int i = 0; i < data_length; i += 3) {
-        unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
-        encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
-        encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
-        if (i + 1 < data_length) {
-            encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
-        } else {
-            encoded.push_back('=');
-        }
-        if (i + 2 < data_length) {
-            encoded.push_back(base64_chars[triple & 0x3F]);
-        } else {
-            encoded.push_back('=');
-        }
-    }
-    return encoded;
-}
static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
static bool notiling = false;
bool sdtype_load_model(const sd_load_model_inputs inputs) {
@@ -553,7 +531,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
    unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, "");
    if (png != NULL)
    {
-       recent_data = base64_encode(png,out_data_len);
+       recent_data = kcpp_base64_encode(png,out_data_len);
        free(png);
    }

otherarch/tts_adapter.cpp (new file, 672 lines)
@@ -0,0 +1,672 @@
#include "model_adapter.h"
#include "otherarch/utils.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <map>
#include <random>
#include <regex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include "src/llama-context.h"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846 // not guaranteed by <cmath> on all toolchains (e.g. MSVC)
#endif
struct wav_header {
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t chunk_size;
char wave[4] = {'W', 'A', 'V', 'E'};
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_chunk_size = 16;
uint16_t audio_format = 1; // PCM
uint16_t num_channels = 1; // Mono
uint32_t sample_rate;
uint32_t byte_rate;
uint16_t block_align;
uint16_t bits_per_sample = 16;
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size;
};
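// Editor's note: save_wav16_base64 below writes this struct byte-for-byte, which
// assumes the canonical 44-byte RIFF header with no compiler padding. Every field
// sits on its natural alignment, so common ABIs pack it to exactly 44 bytes; a
// guard for that assumption (illustrative, not part of this commit):
static_assert(sizeof(wav_header) == 44, "wav_header must serialize to the 44-byte RIFF header");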
static std::string save_wav16_base64(const std::vector<float> &data, int sample_rate) {
std::ostringstream oss;
wav_header header;
// Fill header fields
header.sample_rate = sample_rate;
header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
header.block_align = header.num_channels * (header.bits_per_sample / 8);
header.data_size = data.size() * (header.bits_per_sample / 8);
header.chunk_size = 36 + header.data_size;
// Write header
oss.write(reinterpret_cast<const char*>(&header), sizeof(header));
// Write samples
for (const auto &sample : data) {
int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
oss.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
}
// Get binary WAV data
std::string wav_data = oss.str();
return kcpp_base64_encode(wav_data); //return as base64 string
}
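// Editor's sketch (not part of this commit): exercising the 16-bit PCM WAV +
// base64 path with a synthetic 440 Hz tone at the vocoder's 24 kHz sample rate.
#if 0 // illustrative only
static std::string make_test_tone_b64() {
    const int sr = 24000;
    std::vector<float> samples(sr); // one second of audio
    for (int i = 0; i < sr; ++i) {
        samples[i] = 0.25f * sinf(2.0f * (float)M_PI * 440.0f * i / sr);
    }
    return save_wav16_base64(samples, sr);
}
#endif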
static void fill_hann_window(int length, bool periodic, float * output) {
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
// very poor-man fft
static void twiddle(float * real, float * imag, int k, int N) {
float angle = 2 * M_PI * k / N;
*real = cos(angle);
*imag = sin(angle);
}
static void irfft(int n, const float * inp_cplx, float * out_real) {
int N = n / 2 + 1;
std::vector<float> real_input(N);
std::vector<float> imag_input(N);
for (int i = 0; i < N; ++i) {
real_input[i] = inp_cplx[2 * i];
imag_input[i] = inp_cplx[2 * i + 1];
}
std::vector<float> real_output(n);
std::vector<float> imag_output(n);
for (int k = 0; k < n; ++k) {
real_output[k] = 0.0f;
imag_output[k] = 0.0f;
for (int m = 0; m < N; ++m) {
float twiddle_real;
float twiddle_imag;
twiddle(&twiddle_real, &twiddle_imag, k * m, n);
real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
}
}
for (int i = 0; i < n; ++i) {
out_real[i] = real_output[i] / N;
}
}
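// Editor's note, for reference: with N = n/2 + 1 packed complex bins X_m, the
// loops above compute the naive inverse real DFT
//   out[k] = (1/N) * sum_{m=0..N-1} ( Re(X_m)*cos(2*pi*k*m/n) - Im(X_m)*sin(2*pi*k*m/n) )
// in O(n*N) time, hence the "poor-man fft" caveat; a true FFT would be O(n log n).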
static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
int64_t output_height = n_out;
int64_t kernel_w = n_win;
int64_t stride_w = n_hop;
int64_t width = n_out;
output.resize(width, 0.0f);
int64_t col_idx = 0;
for (int64_t w_col = 0; w_col < width; ++w_col) {
int64_t start = w_col * stride_w - n_pad;
int64_t end = start + kernel_w;
for (int64_t w_im = start; w_im < end; ++w_im) {
if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
output[w_im] += data[col_idx];
}
col_idx++;
}
}
output.resize(n_out - 2 * n_pad);
}
// TODO: not optimized at all
static std::vector<float> embd_to_audio(
const float * embd,
const int n_codes,
const int n_embd,
const int n_thread) {
const int n_fft = 1280;
const int n_hop = 320;
const int n_win = 1280;
const int n_pad = (n_win - n_hop)/2;
const int n_out = (n_codes - 1)*n_hop + n_win;
std::vector<float> hann(n_fft);
fill_hann_window(hann.size(), true, hann.data());
int n_spec = n_embd*n_codes;
std::vector<float> E (n_spec);
std::vector<float> S (n_spec);
std::vector<float> ST(n_spec);
for (int l = 0; l < n_codes; ++l) {
for (int k = 0; k < n_embd; ++k) {
E[k*n_codes + l] = embd[l*n_embd + k];
}
}
for (int k = 0; k < n_embd/2; ++k) {
for (int l = 0; l < n_codes; ++l) {
float mag = E[(k )*n_codes + l];
float phi = E[(k + n_embd/2)*n_codes + l];
mag = exp(mag);
if (mag > 1e2) {
mag = 1e2;
}
S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
}
}
for (int l = 0; l < n_codes; ++l) {
for (int k = 0; k < n_embd/2; ++k) {
ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
}
}
std::vector<float> res (n_codes*n_fft);
std::vector<float> hann2(n_codes*n_fft);
std::vector<std::thread> workers(n_thread);
for (int i = 0; i < n_thread; ++i) {
workers[i] = std::thread([&, i]() {
for (int l = i; l < n_codes; l += n_thread) {
irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
for (int j = 0; j < n_fft; ++j) {
res [l*n_fft + j] *= hann[j];
hann2[l*n_fft + j] = hann[j] * hann[j];
}
}
});
}
for (int i = 0; i < n_thread; ++i) {
workers[i].join();
}
std::vector<float> audio;
std::vector<float> env;
fold(res, n_out, n_win, n_hop, n_pad, audio);
fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
for (size_t i = 0; i < audio.size(); ++i) {
audio[i] /= env[i];
}
return audio;
}
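// Editor's note: the final division is standard synthesis-window normalization
// for overlap-add resynthesis. With Hann window w, hop h and raw inverse-FFT
// frames x_l, fold(res) accumulates sum_l w*x_l and fold(hann2) accumulates
// sum_l w^2, so (padding offsets omitted)
//   audio[t] = ( sum_l w[t - l*h] * x_l[t - l*h] ) / ( sum_l w[t - l*h]^2 )
// which undoes the windowing wherever frames overlap.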
static const std::map<int, std::string> ones = {
{0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
{5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
{10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
{15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
};
static const std::map<int, std::string> tens = {
{2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
{6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
};
// Convert a number less than 1000 to words
static std::string convert_less_than_thousand(int num) {
std::string result;
if (num >= 100) {
result += ones.at(num / 100) + " hundred ";
num %= 100;
}
if (num >= 20) {
result += tens.at(num / 10);
if (num % 10 > 0) {
result += "-" + ones.at(num % 10);
}
} else if (num > 0) {
result += ones.at(num);
}
return result;
}
static std::string number_to_words(const std::string & number_str) {
try {
size_t decimal_pos = number_str.find('.');
std::string integer_part = number_str.substr(0, decimal_pos);
int int_number = std::stoi(integer_part);
std::string result;
if (int_number == 0) {
result = "zero";
} else {
if (int_number >= 1000000000) {
int billions = int_number / 1000000000;
result += convert_less_than_thousand(billions) + " billion ";
int_number %= 1000000000;
}
if (int_number >= 1000000) {
int millions = int_number / 1000000;
result += convert_less_than_thousand(millions) + " million ";
int_number %= 1000000;
}
if (int_number >= 1000) {
int thousands = int_number / 1000;
result += convert_less_than_thousand(thousands) + " thousand ";
int_number %= 1000;
}
if (int_number > 0) {
result += convert_less_than_thousand(int_number);
}
}
// Handle decimal part
if (decimal_pos != std::string::npos) {
result += " point";
std::string decimal_part = number_str.substr(decimal_pos + 1);
for (char digit : decimal_part) {
result += " " + ones.at(digit - '0');
}
}
return result;
} catch (const std::exception& e) {
// Skip if fails
return " ";
}
}
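// Editor's sketch (not part of this commit): spot checks of the conversion.
#if 0 // illustrative only; needs <cassert>
static void test_number_to_words() {
    assert(number_to_words("0") == "zero");
    assert(number_to_words("42") == "forty-two");
    assert(number_to_words("1234.5") == "one thousand two hundred thirty-four point five");
}
#endif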
static std::string replace_numbers_with_words(const std::string & input_text) {
std::regex number_pattern(R"(\d+(\.\d+)?)");
std::string result;
auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
auto end = std::sregex_iterator();
size_t last_pos = 0;
for (std::sregex_iterator i = it; i != end; ++i) {
const std::smatch& match = *i;
result.append(input_text, last_pos, match.position() - last_pos);
result.append(number_to_words(match.str()));
last_pos = match.position() + match.length();
}
result.append(input_text, last_pos);
return result;
}
static std::string process_text(const std::string & text) {
std::string processed_text = replace_numbers_with_words(text);
std::transform(processed_text.begin(), processed_text.end(),
processed_text.begin(), ::tolower);
std::regex special_chars(R"([-_/,\.\\])");
processed_text = std::regex_replace(processed_text, special_chars, " ");
std::regex non_alpha(R"([^a-z\s])");
processed_text = std::regex_replace(processed_text, non_alpha, "");
std::regex multiple_spaces(R"(\s+)");
processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
return processed_text;
}
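// Editor's note: tracing one input through the pipeline (illustrative):
//   process_text("Hello, World! 42")
//     numbers -> words : "Hello, World! forty-two"
//     lowercase        : "hello, world! forty-two"
//     punct -> space   : "hello  world! forty two"   (',' '.' '-' '_' '/' '\' become spaces)
//     strip non-alpha  : "hello  world forty two"
//     collapse + trim  : "hello world forty two"
//     word separators  : "hello<|text_sep|>world<|text_sep|>forty<|text_sep|>two"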
static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
prompt.insert(prompt.end(), tokens.begin(), tokens.end());
}
static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
auto tmp = common_tokenize(model, txt, add_special, parse_special);
prompt_add(prompt, tmp);
}
static void prompt_init(llama_tokens & prompt, const llama_model * model) {
prompt.clear();
prompt_add(prompt, model, "<|im_start|>\n", true, true);
}
static std::vector<llama_token> prepare_guide_tokens(const llama_model * model, const std::string& str)
{
const std::string& delimiter = "<|text_sep|>";
std::vector<llama_token> result;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
std::string current_word = str.substr(start, end - start);
auto tmp = common_tokenize(model, current_word, false, true);
if (!tmp.empty()) { //guard against words that tokenize to nothing
result.push_back(tmp[0]); //only the first token of each word guides generation
}
start = end + delimiter.length();
end = str.find(delimiter, start);
}
// Add the last part
std::string current_word = str.substr(start);
auto tmp = common_tokenize(model, current_word, false, true);
if (!tmp.empty()) {
result.push_back(tmp[0]);
}
return result;
}
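// Editor's note: e.g. prepare_guide_tokens(model, "hello<|text_sep|>world")
// yields two entries: the first token of "hello" and the first token of
// "world". The generation loop below swaps one in whenever a new word starts,
// keeping the spoken output aligned with the input text.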
static llama_context * ttc_ctx = nullptr; //text to codes ctx
static llama_context * cts_ctx = nullptr; //codes to speech
static int ttsdebugmode = 0;
static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
static std::string last_generated_audio = "";
bool ttstype_load_model(const tts_load_model_inputs inputs)
{
//duplicated from expose.cpp
int cl_parseinfo = inputs.clblast_info; //first digit is whether configured, second is platform, third is devices
std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
putenv((char*)usingclblast.c_str());
cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
int platform = cl_parseinfo/10;
int devices = cl_parseinfo%10;
ttsplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
ttsdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
putenv((char*)ttsplatformenv.c_str());
putenv((char*)ttsdeviceenv.c_str());
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
if(vulkan_info_str!="")
{
ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)ttsvulkandeviceenv.c_str());
}
llama_backend_init();
std::string modelfile_ttc = inputs.ttc_model_filename;
std::string modelfile_cts = inputs.cts_model_filename;
printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
ttsdebugmode = inputs.debugmode;
// tts init
llama_model_params tts_model_params = llama_model_default_params();
llama_context_params tts_ctx_params = llama_context_default_params();
const int nthreads = 4;
tts_model_params.use_mmap = false;
tts_model_params.use_mlock = false;
tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
tts_ctx_params.n_ctx = 8192;
tts_ctx_params.logits_all = false;
tts_ctx_params.offload_kqv = true;
tts_ctx_params.n_batch = 8192;
tts_ctx_params.n_ubatch = 512;
tts_ctx_params.n_threads = nthreads;
tts_ctx_params.n_threads_batch = nthreads;
tts_ctx_params.flash_attn = false;
llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
if (ttcmodel == nullptr) {
printf("\nTTS Load Error: Failed to load ttc model!\n");
return false;
}
ttc_ctx = llama_new_context_with_model(ttcmodel, tts_ctx_params);
if (ttc_ctx == nullptr) {
printf("\nTTS Load Error: Failed to initialize ttc context!\n");
return false;
}
llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
if (ctsmodel == nullptr) {
printf("\nTTS Load Error: Failed to load cts model!\n");
return false;
}
tts_ctx_params.embeddings = true; //the vocoder needs embeddings output rather than logits
cts_ctx = llama_new_context_with_model(ctsmodel, tts_ctx_params);
if (cts_ctx == nullptr) {
printf("\nTTS Load Error: Failed to initialize cts context!\n");
return false;
}
//warmup: verify the ttc context can decode a tiny batch before declaring success
std::vector<llama_token> tmp = {1, 2, 3, 4};
llama_kv_cache_clear(ttc_ctx);
auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
if(er!=0)
{
printf("\nTTS Eval returned nonzero: %d\n",er);
return false;
}
printf("\nTTS Load Complete.\n");
return true;
}
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
{
tts_generation_outputs output;
if(ttc_ctx==nullptr || cts_ctx==nullptr)
{
printf("\nWarning: KCPP TTS not initialized!\n");
output.data = "";
output.status = 0;
return output;
}
std::vector<llama_token> codes;
std::vector<llama_token> guide_tokens;
const llama_model * model_ttc = &(ttc_ctx->model);
const llama_model * model_cts = &(cts_ctx->model);
const int ttc_n_vocab = llama_n_vocab(model_ttc);
std::string prompt = inputs.prompt;
if(!inputs.quiet)
{
printf("\nTTS Generating... ");
}
// process prompt and generate voice codes
std::vector<llama_token> prompt_inp;
prompt_init(prompt_inp, model_ttc);
prompt_add(prompt_inp, model_ttc, "<|text_start|>", false, true);
int speaker_seed = inputs.speaker_seed;
int audio_seed = inputs.audio_seed;
if (speaker_seed <= 0 || speaker_seed==0xFFFFFFFF)
{
speaker_seed = (((uint32_t)time(NULL)) % 1000000u);
if(ttsdebugmode==1)
{
printf("\nUsing Speaker Seed: %d", speaker_seed);
}
}
if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
{
audio_seed = (((uint32_t)time(NULL)) % 1000000u);
if(ttsdebugmode==1)
{
printf("\nUsing Audio Seed: %d", audio_seed);
}
}
std::mt19937 tts_rng(audio_seed);
std::mt19937 speaker_rng(speaker_seed);
//add the speaker based on the seed
if(speaker_seed>0)
{
//TODO: speaker voice conditioning is not wired up yet; sampletext is an unused placeholder
std::string sampletext = "but<|text_sep|>that<|text_sep|>is<|text_sep|>what<|text_sep|>it<|text_sep|>is<|text_sep|>";
}
// convert the input text into the necessary format expected by OuteTTS
std::string prompt_clean = process_text(prompt);
if(prompt_clean.size()==0)
{
//no input - bail out regardless of quiet mode
if(!inputs.quiet)
{
printf("\nTTS sent empty input.\n");
}
output.data = "";
output.status = 1;
return output;
}
if(!inputs.quiet && ttsdebugmode==1)
{
printf("\nInput: %s\n", prompt_clean.c_str());
}
guide_tokens = prepare_guide_tokens(model_ttc,prompt_clean);
prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
if(!inputs.quiet)
{
printf(" (%d input words)...", guide_tokens.size());
}
prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
//create batch with tokens for decoding prompt processing
llama_kv_cache_clear(ttc_ctx);
llama_kv_cache_clear(cts_ctx);
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
if (!evalok) {
printf("\nError: TTS prompt batch processing failed\n");
output.data = "";
output.status = 0;
return output;
}
// main loop
int n_decode = 0;
int n_predict = 4096; //max 4096 tokens
bool next_token_uses_guide_token = true;
while (n_decode <= n_predict)
{
float * logits = llama_get_logits(ttc_ctx);
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,20,1.0,tts_rng);
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
if(!guide_tokens.empty() && next_token_uses_guide_token && !llama_token_is_control(model_ttc, new_token_id) && !llama_token_is_eog(model_ttc, new_token_id))
{
llama_token guide_token = guide_tokens[0];
guide_tokens.erase(guide_tokens.begin());
new_token_id = guide_token; //ensure correct word fragment is used
}
//token id 198 ('\n' in the OuteTTS vocab) always precedes a new word
next_token_uses_guide_token = (new_token_id == 198);
codes.push_back(new_token_id);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model_ttc, new_token_id) || n_decode >= n_predict) {
break;
}
n_decode += 1;
std::vector<llama_token> next = {new_token_id};
llama_batch batch = llama_batch_get_one(next.data(), next.size());
// evaluate the current batch with the transformer model
if (llama_decode(ttc_ctx, batch)) {
printf("\nError: TTS code generation failed!\n");
output.data = "";
output.status = 0;
return output;
}
}
if(!inputs.quiet && ttsdebugmode==1)
{
const std::string inp_txt = common_detokenize(ttc_ctx, codes, true);
printf("\nGenerated %d Codes: '%s'\n",codes.size(), inp_txt.c_str());
}
// remove all non-audio tokens (i.e. < 151672 || > 155772)
codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
for (auto & token : codes) {
token -= 151672;
}
const int n_codes = codes.size();
if(n_codes<=1)
{
printf("\nWarning: TTS vocoder generated nothing!\n");
output.data = "";
output.status = 0;
return output;
}
kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
if (llama_decode(cts_ctx, codebatch.batch) != 0) {
printf("\nError: TTS vocoder generation failed!\n");
output.data = "";
output.status = 0;
return output;
}
else
{
// spectral operations
const int n_embd = llama_n_embd(model_cts);
const float * embd = llama_get_embeddings(cts_ctx);
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
const int n_sr = 24000; // sampling rate
// zero out first 0.05 seconds
for (int i = 0; i < 24000/20; ++i) {
audio[i] = 0.0f;
}
//add some silence at the end
for (int i = 0; i < 24000/20; ++i) {
audio.push_back(0.0f);
}
last_generated_audio = save_wav16_base64(audio, n_sr);
if(!inputs.quiet)
{
printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
}
output.data = last_generated_audio.c_str();
output.status = 1;
return output;
}
}


otherarch/utils.cpp
@@ -1,5 +1,6 @@
#include "utils.h"
#include "common.h"
+#include "llama.h"
#include <cmath>
#include <cstring>
@@ -303,6 +304,47 @@ std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string)
    return ret;
}
std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length) {
const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
std::string encoded;
encoded.reserve(((data_length + 2) / 3) * 4);
for (unsigned int i = 0; i < data_length; i += 3) {
unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
if (i + 1 < data_length) {
encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
} else {
encoded.push_back('=');
}
if (i + 2 < data_length) {
encoded.push_back(base64_chars[triple & 0x3F]);
} else {
encoded.push_back('=');
}
}
return encoded;
}
std::string kcpp_base64_encode(const std::string &data) {
static const char lookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
std::string encoded;
int val = 0, valb = -6;
for (unsigned char c : data) {
val = (val << 8) + c;
valb += 8;
while (valb >= 0) {
encoded.push_back(lookup[(val >> valb) & 0x3F]);
valb -= 6;
}
}
if (valb > -6) {
encoded.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]);
}
while (encoded.size() % 4) {
encoded.push_back('=');
}
return encoded;
}
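// Editor's sketch (not part of this commit): round-tripping the two encoders
// against the existing decoder above.
#if 0 // illustrative only
void base64_roundtrip_demo() {
    std::string s = "hello";
    std::string e1 = kcpp_base64_encode(s);                      // "aGVsbG8="
    std::string e2 = kcpp_base64_encode(
        (const unsigned char*)s.data(), (unsigned int)s.size()); // same result
    std::vector<uint8_t> raw = kcpp_base64_decode(e1);           // bytes of "hello"
}
#endif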
std::string get_timestamp_str()
{
@@ -314,3 +356,150 @@ std::string get_timestamp_str()
    std::string timestamp(buffer);
    return timestamp;
}
//a very rudimentary all-in-one sampling function with no dependencies
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
{
if (temp <= 0 || top_k==1) {
// select the token with the highest logit directly
float max_logit = logits[0];
int32_t max_id = 0;
for (int i = 1; i < n_logits; ++i) {
if (logits[i] > max_logit) {
max_logit = logits[i];
max_id = i;
}
}
return max_id;
}
top_k = (top_k<=0 || top_k>300)?300:top_k;
top_k = std::min(top_k, n_logits);
std::vector<std::pair<float, int32_t>> logits_id;
logits_id.reserve(n_logits);
//temperature sample
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
//sample top_k
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// compute probs for the top k tokens
std::vector<float> probs;
probs.reserve(logits_id.size());
float maxl = logits_id[0].first;
double sum = 0.0;
for (const auto & kv : logits_id) {
const float p = expf(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
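// Editor's note: for temp > 0 and top_k > 1 this is a softmax over the top-k
// temperature-scaled logits, p_i ~ exp((l_i - max_j l_j) / T); otherwise greedy
// argmax. Hypothetical call, mirroring the TTS loop (ctx assumed decoded):
#if 0 // illustrative only
std::mt19937 rng(1234);
float * logits = llama_get_logits(ctx);
llama_token tok = kcpp_quick_sample(logits, llama_n_vocab(llama_get_model(ctx)),
                                    /*top_k=*/20, /*temp=*/1.0f, rng);
#endif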
kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope)
{
int32_t seq_id = 0;
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
}
kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
{
int32_t seq_id = 0;
int32_t n_tokens = tokens.size();
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids[n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens.data(),
/*embd =*/ nullptr,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
batch.logits[n_tokens - 1] = true;
}
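// Editor's sketch (not part of this commit): how the token-batch helper is
// consumed, mirroring tts_adapter.cpp (model/ctx assumed already loaded).
#if 0 // illustrative only
std::vector<llama_token> tokens = common_tokenize(model, "hello world", true, true);
kcpp_embd_batch batch(tokens, /*npast=*/0, /*use_mrope=*/false, /*return_all_logits=*/true);
if (llama_decode(ctx, batch.batch) != 0) {
    printf("decode failed\n");
}
#endif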


otherarch/utils.h
@@ -8,6 +8,7 @@
#include <random>
#include <thread>
#include "ggml_v3.h"
+#include "llama.h"
//
// CLI argument parsing
@@ -52,10 +53,23 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
bool should_transpose_layer(std::string name);
void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string);
+std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length);
+std::string kcpp_base64_encode(const std::string &data);
std::string get_timestamp_str();
+int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
+
+struct kcpp_embd_batch { //duplicated from llava_embd_batch
+    std::vector<int32_t> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<int32_t> seq_id_0;
+    std::vector<int32_t *> seq_ids;
+    std::vector<int8_t> logits;
+    llama_batch batch;
+    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope);
+    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits);
+};