mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
Fixed some GGUFv1 loading bugs, long overdue cleanup for compiling, integrated TTS
tts is functional (+6 squashed commit) Squashed commit: [22396311] wip tts [3a883027] tts not yet working [0dcfab0e] fix silly bug [a378d9ef] some long overdue cleanup [fc5a6fb5] Wip tts [39f50497] wip TTS integration
This commit is contained in:
parent
12cdcf0abe
commit
b3de1598e7
17 changed files with 1175 additions and 271 deletions
|
@ -495,7 +495,9 @@ add_library(common2
|
||||||
examples/llava/clip.h
|
examples/llava/clip.h
|
||||||
src/unicode.h
|
src/unicode.h
|
||||||
src/unicode.cpp
|
src/unicode.cpp
|
||||||
src/unicode-data.cpp)
|
src/unicode-data.cpp
|
||||||
|
otherarch/utils.cpp
|
||||||
|
otherarch/utils.h)
|
||||||
target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
|
target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
|
||||||
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
|
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
|
||||||
|
@ -515,11 +517,18 @@ target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
|
add_library(tts_adapter
|
||||||
|
otherarch/tts_adapter.cpp)
|
||||||
|
target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common)
|
||||||
|
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
|
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
||||||
|
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
add_library(gpttype_adapter
|
add_library(gpttype_adapter
|
||||||
gpttype_adapter.cpp)
|
gpttype_adapter.cpp)
|
||||||
target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
|
target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
|
||||||
target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
|
target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUBLAS)
|
||||||
|
@ -530,8 +539,16 @@ if (LLAMA_CUBLAS)
|
||||||
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
|
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
|
||||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
TARGET koboldcpp_cublas POST_BUILD
|
||||||
|
COMMAND ${CMAKE_COMMAND} -E copy
|
||||||
|
$<TARGET_FILE:koboldcpp_cublas> # The generated DLL
|
||||||
|
${CMAKE_SOURCE_DIR}/ # Destination directory
|
||||||
|
COMMENT "Copying DLL to parent directory"
|
||||||
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_HIPBLAS)
|
if (LLAMA_HIPBLAS)
|
||||||
|
@ -542,7 +559,15 @@ if (LLAMA_HIPBLAS)
|
||||||
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
|
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
|
||||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
TARGET koboldcpp_hipblas POST_BUILD
|
||||||
|
COMMAND ${CMAKE_COMMAND} -E copy
|
||||||
|
$<TARGET_FILE:koboldcpp_hipblas> # The generated DLL
|
||||||
|
${CMAKE_SOURCE_DIR}/ # Destination directory
|
||||||
|
COMMENT "Copying DLL to parent directory"
|
||||||
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
36
Makefile
36
Makefile
|
@ -4,7 +4,7 @@
|
||||||
.PHONY: finishedmsg
|
.PHONY: finishedmsg
|
||||||
|
|
||||||
default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg
|
default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg
|
||||||
tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
|
tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip ttsmain whispermain sdmain gguf-split
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
|
@ -90,10 +90,10 @@ endif
|
||||||
CUBLASLD_FLAGS =
|
CUBLASLD_FLAGS =
|
||||||
CUBLAS_OBJS =
|
CUBLAS_OBJS =
|
||||||
|
|
||||||
OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o
|
OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o
|
||||||
OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o
|
OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o
|
||||||
OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o
|
OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o
|
||||||
OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o
|
OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
ifeq ($(UNAME_S),Linux)
|
ifeq ($(UNAME_S),Linux)
|
||||||
|
@ -539,6 +539,8 @@ ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h
|
gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
kcpputils.o: otherarch/utils.cpp otherarch/utils.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#these have special gpu defines
|
#these have special gpu defines
|
||||||
ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
||||||
|
@ -639,8 +641,12 @@ whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp
|
||||||
whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
|
whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
|
||||||
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
#tts objects
|
||||||
|
tts_default.o: otherarch/tts_adapter.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
# idiotic "for easier compilation"
|
# idiotic "for easier compilation"
|
||||||
GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
|
GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
|
||||||
gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
|
gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
|
||||||
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
|
||||||
gpttype_adapter.o: $(GPTTYPE_ADAPTER)
|
gpttype_adapter.o: $(GPTTYPE_ADAPTER)
|
||||||
|
@ -680,11 +686,11 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/ggml-vulkan/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
|
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/ggml-vulkan/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
|
||||||
|
|
||||||
#generated libraries
|
#generated libraries
|
||||||
koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(DEFAULT_BUILD)
|
$(DEFAULT_BUILD)
|
||||||
|
|
||||||
ifdef FAILSAFE_BUILD
|
ifdef FAILSAFE_BUILD
|
||||||
koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
|
koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
|
||||||
$(FAILSAFE_BUILD)
|
$(FAILSAFE_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_failsafe:
|
koboldcpp_failsafe:
|
||||||
|
@ -692,7 +698,7 @@ koboldcpp_failsafe:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(NOAVX2_BUILD)
|
$(NOAVX2_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_noavx2:
|
koboldcpp_noavx2:
|
||||||
|
@ -700,10 +706,10 @@ koboldcpp_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CLBLAST_BUILD
|
ifdef CLBLAST_BUILD
|
||||||
koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
|
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o tts_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_clblast_noavx2:
|
koboldcpp_clblast_noavx2:
|
||||||
|
@ -717,7 +723,7 @@ koboldcpp_clblast_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CUBLAS_BUILD
|
ifdef CUBLAS_BUILD
|
||||||
koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(CUBLAS_BUILD)
|
$(CUBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_cublas:
|
koboldcpp_cublas:
|
||||||
|
@ -725,7 +731,7 @@ koboldcpp_cublas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef HIPBLAS_BUILD
|
ifdef HIPBLAS_BUILD
|
||||||
koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(HIPBLAS_BUILD)
|
$(HIPBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_hipblas:
|
koboldcpp_hipblas:
|
||||||
|
@ -733,10 +739,10 @@ koboldcpp_hipblas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef VULKAN_BUILD
|
ifdef VULKAN_BUILD
|
||||||
koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o tts_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_vulkan_noavx2:
|
koboldcpp_vulkan_noavx2:
|
||||||
|
|
|
@ -238,6 +238,15 @@ extern "C"
|
||||||
return whispertype_generate(inputs);
|
return whispertype_generate(inputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool tts_load_model(const tts_load_model_inputs inputs)
|
||||||
|
{
|
||||||
|
return ttstype_load_model(inputs);
|
||||||
|
}
|
||||||
|
tts_generation_outputs tts_generate(const tts_generation_inputs inputs)
|
||||||
|
{
|
||||||
|
return ttstype_generate(inputs);
|
||||||
|
}
|
||||||
|
|
||||||
const char * new_token(int idx) {
|
const char * new_token(int idx) {
|
||||||
if (generated_tokens.size() <= idx || idx < 0) return nullptr;
|
if (generated_tokens.size() <= idx || idx < 0) return nullptr;
|
||||||
|
|
||||||
|
|
26
expose.h
26
expose.h
|
@ -139,6 +139,7 @@ struct last_logprobs_outputs {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
logprob_item * logprob_items = nullptr;
|
logprob_item * logprob_items = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct sd_load_model_inputs
|
struct sd_load_model_inputs
|
||||||
{
|
{
|
||||||
const char * model_filename = nullptr;
|
const char * model_filename = nullptr;
|
||||||
|
@ -178,6 +179,7 @@ struct sd_generation_outputs
|
||||||
int status = -1;
|
int status = -1;
|
||||||
const char * data = "";
|
const char * data = "";
|
||||||
};
|
};
|
||||||
|
|
||||||
struct whisper_load_model_inputs
|
struct whisper_load_model_inputs
|
||||||
{
|
{
|
||||||
const char * model_filename = nullptr;
|
const char * model_filename = nullptr;
|
||||||
|
@ -201,6 +203,30 @@ struct whisper_generation_outputs
|
||||||
const char * text = "";
|
const char * text = "";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct tts_load_model_inputs
|
||||||
|
{
|
||||||
|
const char * ttc_model_filename = nullptr;
|
||||||
|
const char * cts_model_filename = nullptr;
|
||||||
|
const char * executable_path = nullptr;
|
||||||
|
const int clblast_info = 0;
|
||||||
|
const int cublas_info = 0;
|
||||||
|
const char * vulkan_info = nullptr;
|
||||||
|
const int gpulayers = 0;
|
||||||
|
const int debugmode = 0;
|
||||||
|
};
|
||||||
|
struct tts_generation_inputs
|
||||||
|
{
|
||||||
|
const char * prompt = nullptr;
|
||||||
|
const int speaker_seed = 0;
|
||||||
|
const int audio_seed = 0;
|
||||||
|
const bool quiet = false;
|
||||||
|
};
|
||||||
|
struct tts_generation_outputs
|
||||||
|
{
|
||||||
|
int status = -1;
|
||||||
|
const char * data = "";
|
||||||
|
};
|
||||||
|
|
||||||
extern std::string executable_path;
|
extern std::string executable_path;
|
||||||
extern std::string lora_filename;
|
extern std::string lora_filename;
|
||||||
extern std::string lora_base;
|
extern std::string lora_base;
|
||||||
|
|
|
@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ok && gr.read(n_kv_32)) {
|
if (ok && gr.read(n_kv_32)) {
|
||||||
n_kv_32 = n_kv_32;
|
n_kv = n_kv_32;
|
||||||
} else {
|
} else {
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,12 +21,13 @@
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
//for easier compilation
|
//for easier compilation
|
||||||
//concat source files into one file for compilation purposes
|
//concat source files into one file for compilation purposes
|
||||||
#include "llama_v2.cpp"
|
#include "llama_v2.cpp"
|
||||||
#include "llama_v3.cpp"
|
#include "llama_v3.cpp"
|
||||||
#include "src/llama.cpp"
|
#include "src/llama.cpp"
|
||||||
#include "utils.cpp"
|
|
||||||
#include "gptj_v1.cpp"
|
#include "gptj_v1.cpp"
|
||||||
#include "gptj_v2.cpp"
|
#include "gptj_v2.cpp"
|
||||||
#include "gptj_v3.cpp"
|
#include "gptj_v3.cpp"
|
||||||
|
@ -535,99 +536,6 @@ const char * kcpp_print_system_info(void) {
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
struct kcpp_embd_batch { //duplcated from llava_embd_batch
|
|
||||||
std::vector<int32_t> pos;
|
|
||||||
std::vector<int32_t> n_seq_id;
|
|
||||||
std::vector<int32_t> seq_id_0;
|
|
||||||
std::vector<int32_t *> seq_ids;
|
|
||||||
std::vector<int8_t> logits;
|
|
||||||
llama_batch batch;
|
|
||||||
kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) {
|
|
||||||
int32_t seq_id = 0;
|
|
||||||
pos.resize(n_tokens * (use_mrope?4:1));
|
|
||||||
std::fill(pos.begin(), pos.end(), 0);
|
|
||||||
n_seq_id.resize(n_tokens);
|
|
||||||
seq_ids.resize(n_tokens + 1);
|
|
||||||
logits.resize(n_tokens);
|
|
||||||
seq_id_0.resize(1);
|
|
||||||
seq_id_0[0] = seq_id;
|
|
||||||
seq_ids [n_tokens] = nullptr;
|
|
||||||
batch = {
|
|
||||||
/*n_tokens =*/ n_tokens,
|
|
||||||
/*tokens =*/ nullptr,
|
|
||||||
/*embd =*/ embd,
|
|
||||||
/*pos =*/ pos.data(),
|
|
||||||
/*n_seq_id =*/ n_seq_id.data(),
|
|
||||||
/*seq_id =*/ seq_ids.data(),
|
|
||||||
/*logits =*/ logits.data(),
|
|
||||||
};
|
|
||||||
|
|
||||||
if(!use_mrope)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
|
||||||
batch.pos [i] = npast + i;
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = false;
|
|
||||||
}
|
|
||||||
for (int j = 0; j < batch.n_tokens * 3; j++) {
|
|
||||||
batch.pos[j] = npast + (j % batch.n_tokens);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits) {
|
|
||||||
int32_t seq_id = 0;
|
|
||||||
int32_t n_tokens = tokens.size();
|
|
||||||
pos.resize(n_tokens * (use_mrope?4:1));
|
|
||||||
std::fill(pos.begin(), pos.end(), 0);
|
|
||||||
n_seq_id.resize(n_tokens);
|
|
||||||
seq_ids.resize(n_tokens + 1);
|
|
||||||
logits.resize(n_tokens);
|
|
||||||
seq_id_0.resize(1);
|
|
||||||
seq_id_0[0] = seq_id;
|
|
||||||
seq_ids[n_tokens] = nullptr;
|
|
||||||
batch = {
|
|
||||||
/*n_tokens =*/ n_tokens,
|
|
||||||
/*tokens =*/ tokens.data(),
|
|
||||||
/*embd =*/ nullptr,
|
|
||||||
/*pos =*/ pos.data(),
|
|
||||||
/*n_seq_id =*/ n_seq_id.data(),
|
|
||||||
/*seq_id =*/ seq_ids.data(),
|
|
||||||
/*logits =*/ logits.data(),
|
|
||||||
};
|
|
||||||
|
|
||||||
if(!use_mrope)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
|
||||||
batch.pos [i] = npast + i;
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = (return_all_logits?true:false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = (return_all_logits?true:false);
|
|
||||||
}
|
|
||||||
for (int j = 0; j < batch.n_tokens * 3; j++) {
|
|
||||||
batch.pos[j] = npast + (j % batch.n_tokens);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
batch.logits[n_tokens - 1] = true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
//loads a model for speculative decoding.
|
//loads a model for speculative decoding.
|
||||||
static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers)
|
static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers)
|
||||||
{
|
{
|
||||||
|
@ -664,7 +572,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
|
||||||
draft_ctx_params.type_k = base_ctx_params.type_k;
|
draft_ctx_params.type_k = base_ctx_params.type_k;
|
||||||
draft_ctx_params.type_v = base_ctx_params.type_v;
|
draft_ctx_params.type_v = base_ctx_params.type_v;
|
||||||
|
|
||||||
llama_model * draftmodel = llama_load_model_from_file(spec_model_filename.c_str(), draft_model_params);
|
llama_model * draftmodel = llama_model_load_from_file(spec_model_filename.c_str(), draft_model_params);
|
||||||
draft_ctx = llama_new_context_with_model(draftmodel, draft_ctx_params);
|
draft_ctx = llama_new_context_with_model(draftmodel, draft_ctx_params);
|
||||||
if(draft_ctx == NULL)
|
if(draft_ctx == NULL)
|
||||||
{
|
{
|
||||||
|
@ -2252,7 +2160,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
kvos.push_back(kvo);
|
kvos.push_back(kvo);
|
||||||
model_params.kv_overrides = kvos.data();
|
model_params.kv_overrides = kvos.data();
|
||||||
}
|
}
|
||||||
llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);
|
llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
|
||||||
|
|
||||||
if(overwriteRope)
|
if(overwriteRope)
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
#ifndef LLAMA_H
|
#ifndef LLAMA_H
|
||||||
#define LLAMA_H
|
#define LLAMA_H
|
||||||
|
|
||||||
|
|
214
koboldcpp.py
214
koboldcpp.py
|
@ -53,6 +53,7 @@ fullsdmodelpath = "" #if empty, it's not initialized
|
||||||
mmprojpath = "" #if empty, it's not initialized
|
mmprojpath = "" #if empty, it's not initialized
|
||||||
password = "" #if empty, no auth key required
|
password = "" #if empty, no auth key required
|
||||||
fullwhispermodelpath = "" #if empty, it's not initialized
|
fullwhispermodelpath = "" #if empty, it's not initialized
|
||||||
|
ttsmodelpath = "" #if empty, not initialized
|
||||||
maxctx = 4096
|
maxctx = 4096
|
||||||
maxhordectx = 4096
|
maxhordectx = 4096
|
||||||
maxhordelen = 400
|
maxhordelen = 400
|
||||||
|
@ -281,6 +282,26 @@ class whisper_generation_outputs(ctypes.Structure):
|
||||||
_fields_ = [("status", ctypes.c_int),
|
_fields_ = [("status", ctypes.c_int),
|
||||||
("data", ctypes.c_char_p)]
|
("data", ctypes.c_char_p)]
|
||||||
|
|
||||||
|
class tts_load_model_inputs(ctypes.Structure):
|
||||||
|
_fields_ = [("ttc_model_filename", ctypes.c_char_p),
|
||||||
|
("cts_model_filename", ctypes.c_char_p),
|
||||||
|
("executable_path", ctypes.c_char_p),
|
||||||
|
("clblast_info", ctypes.c_int),
|
||||||
|
("cublas_info", ctypes.c_int),
|
||||||
|
("vulkan_info", ctypes.c_char_p),
|
||||||
|
("gpulayers", ctypes.c_int),
|
||||||
|
("debugmode", ctypes.c_int)]
|
||||||
|
|
||||||
|
class tts_generation_inputs(ctypes.Structure):
|
||||||
|
_fields_ = [("prompt", ctypes.c_char_p),
|
||||||
|
("speaker_seed", ctypes.c_int),
|
||||||
|
("audio_seed", ctypes.c_int),
|
||||||
|
("quiet", ctypes.c_bool)]
|
||||||
|
|
||||||
|
class tts_generation_outputs(ctypes.Structure):
|
||||||
|
_fields_ = [("status", ctypes.c_int),
|
||||||
|
("data", ctypes.c_char_p)]
|
||||||
|
|
||||||
def getdirpath():
|
def getdirpath():
|
||||||
return os.path.dirname(os.path.realpath(__file__))
|
return os.path.dirname(os.path.realpath(__file__))
|
||||||
def getabspath():
|
def getabspath():
|
||||||
|
@ -440,6 +461,10 @@ def init_library():
|
||||||
handle.whisper_load_model.restype = ctypes.c_bool
|
handle.whisper_load_model.restype = ctypes.c_bool
|
||||||
handle.whisper_generate.argtypes = [whisper_generation_inputs]
|
handle.whisper_generate.argtypes = [whisper_generation_inputs]
|
||||||
handle.whisper_generate.restype = whisper_generation_outputs
|
handle.whisper_generate.restype = whisper_generation_outputs
|
||||||
|
handle.tts_load_model.argtypes = [tts_load_model_inputs]
|
||||||
|
handle.tts_load_model.restype = ctypes.c_bool
|
||||||
|
handle.tts_generate.argtypes = [tts_generation_inputs]
|
||||||
|
handle.tts_generate.restype = tts_generation_outputs
|
||||||
handle.last_logprobs.restype = last_logprobs_outputs
|
handle.last_logprobs.restype = last_logprobs_outputs
|
||||||
handle.detokenize.argtypes = [token_count_outputs]
|
handle.detokenize.argtypes = [token_count_outputs]
|
||||||
handle.detokenize.restype = ctypes.c_char_p
|
handle.detokenize.restype = ctypes.c_char_p
|
||||||
|
@ -577,9 +602,13 @@ def utfprint(str, importance = 2): #0 = only debugmode, 1 = except quiet, 2 = al
|
||||||
maxlen = 32000
|
maxlen = 32000
|
||||||
if args.debugmode >= 1:
|
if args.debugmode >= 1:
|
||||||
maxlen = 64000
|
maxlen = 64000
|
||||||
strlength = len(str)
|
try:
|
||||||
if strlength > maxlen: #limit max output len
|
strlength = len(str)
|
||||||
str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
|
if strlength > maxlen: #limit max output len
|
||||||
|
str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(str)
|
print(str)
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
|
@ -647,13 +676,14 @@ def read_gguf_metadata(file_path):
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath):
|
def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath):
|
||||||
global modelfile_extracted_meta
|
global modelfile_extracted_meta
|
||||||
modelfile_extracted_meta = None
|
modelfile_extracted_meta = None
|
||||||
sdfsize = 0
|
sdfsize = 0
|
||||||
whisperfsize = 0
|
whisperfsize = 0
|
||||||
mmprojsize = 0
|
mmprojsize = 0
|
||||||
draftmodelsize = 0
|
draftmodelsize = 0
|
||||||
|
ttsmodelsize = 0
|
||||||
if sdfilepath and os.path.exists(sdfilepath):
|
if sdfilepath and os.path.exists(sdfilepath):
|
||||||
sdfsize = os.path.getsize(sdfilepath)
|
sdfsize = os.path.getsize(sdfilepath)
|
||||||
if whisperfilepath and os.path.exists(whisperfilepath):
|
if whisperfilepath and os.path.exists(whisperfilepath):
|
||||||
|
@ -662,12 +692,14 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
|
||||||
mmprojsize = os.path.getsize(mmprojfilepath)
|
mmprojsize = os.path.getsize(mmprojfilepath)
|
||||||
if draftmodelpath and os.path.exists(draftmodelpath):
|
if draftmodelpath and os.path.exists(draftmodelpath):
|
||||||
draftmodelsize = os.path.getsize(draftmodelpath)
|
draftmodelsize = os.path.getsize(draftmodelpath)
|
||||||
|
if ttsmodelpath and os.path.exists(ttsmodelpath):
|
||||||
|
ttsmodelsize = os.path.getsize(ttsmodelpath)
|
||||||
if filepath and os.path.exists(filepath):
|
if filepath and os.path.exists(filepath):
|
||||||
try:
|
try:
|
||||||
fsize = os.path.getsize(filepath)
|
fsize = os.path.getsize(filepath)
|
||||||
if fsize>10000000: #dont bother with models < 10mb as they are probably bad
|
if fsize>10000000: #dont bother with models < 10mb as they are probably bad
|
||||||
ggufmeta = read_gguf_metadata(filepath)
|
ggufmeta = read_gguf_metadata(filepath)
|
||||||
modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize] #extract done. note that meta may be null
|
modelfile_extracted_meta = [ggufmeta,fsize,sdfsize,whisperfsize,mmprojsize,draftmodelsize,ttsmodelsize] #extract done. note that meta may be null
|
||||||
except Exception:
|
except Exception:
|
||||||
modelfile_extracted_meta = None
|
modelfile_extracted_meta = None
|
||||||
|
|
||||||
|
@ -699,6 +731,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how man
|
||||||
mem -= 350*1024*1024
|
mem -= 350*1024*1024
|
||||||
if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax
|
if modelfile_extracted_meta[5] > 1024*1024*10: #draft model tax
|
||||||
mem -= (modelfile_extracted_meta[5] * 1.5)
|
mem -= (modelfile_extracted_meta[5] * 1.5)
|
||||||
|
if modelfile_extracted_meta[6] > 1024*1024*10: #tts model tax
|
||||||
|
mem -= max(600*1024*1024, modelfile_extracted_meta[6] * 3)
|
||||||
mem = 0 if mem < 0 else mem
|
mem = 0 if mem < 0 else mem
|
||||||
|
|
||||||
csmul = 1.0
|
csmul = 1.0
|
||||||
|
@ -730,6 +764,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
|
||||||
FetchedCUdevices = []
|
FetchedCUdevices = []
|
||||||
FetchedCUdeviceMem = []
|
FetchedCUdeviceMem = []
|
||||||
FetchedCUfreeMem = []
|
FetchedCUfreeMem = []
|
||||||
|
faileddetectvram = False
|
||||||
|
|
||||||
AMDgpu = None
|
AMDgpu = None
|
||||||
try: # Get NVIDIA GPU names
|
try: # Get NVIDIA GPU names
|
||||||
output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
|
output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
|
||||||
|
@ -737,6 +773,10 @@ def fetch_gpu_properties(testCL,testCU,testVK):
|
||||||
FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
|
FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
|
||||||
FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
|
FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
|
||||||
except Exception:
|
except Exception:
|
||||||
|
FetchedCUdevices = []
|
||||||
|
FetchedCUdeviceMem = []
|
||||||
|
FetchedCUfreeMem = []
|
||||||
|
faileddetectvram = True
|
||||||
pass
|
pass
|
||||||
if len(FetchedCUdevices)==0:
|
if len(FetchedCUdevices)==0:
|
||||||
try: # Get AMD ROCm GPU names
|
try: # Get AMD ROCm GPU names
|
||||||
|
@ -756,18 +796,30 @@ def fetch_gpu_properties(testCL,testCU,testVK):
|
||||||
if getamdvram:
|
if getamdvram:
|
||||||
FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
|
FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
|
||||||
except Exception:
|
except Exception:
|
||||||
|
FetchedCUdevices = []
|
||||||
|
FetchedCUdeviceMem = []
|
||||||
|
FetchedCUfreeMem = []
|
||||||
|
faileddetectvram = True
|
||||||
pass
|
pass
|
||||||
lowestcumem = 0
|
lowestcumem = 0
|
||||||
lowestfreecumem = 0
|
lowestfreecumem = 0
|
||||||
for idx in range(0,4):
|
try:
|
||||||
if(len(FetchedCUdevices)>idx):
|
for idx in range(0,4):
|
||||||
CUDevicesNames[idx] = FetchedCUdevices[idx]
|
if(len(FetchedCUdevices)>idx):
|
||||||
if len(FetchedCUdeviceMem)>idx:
|
CUDevicesNames[idx] = FetchedCUdevices[idx]
|
||||||
dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
|
if len(FetchedCUdeviceMem)>idx:
|
||||||
lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
|
dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
|
||||||
if len(FetchedCUfreeMem)>idx:
|
lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
|
||||||
dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
|
if len(FetchedCUfreeMem)>idx:
|
||||||
lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
|
dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
|
||||||
|
lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
|
||||||
|
except Exception:
|
||||||
|
lowestcumem = 0
|
||||||
|
lowestfreecumem = 0
|
||||||
|
faileddetectvram = True
|
||||||
|
|
||||||
|
if faileddetectvram:
|
||||||
|
print("Unable to detect VRAM, please set layers manually.")
|
||||||
|
|
||||||
MaxMemory[0] = max(lowestcumem,MaxMemory[0])
|
MaxMemory[0] = max(lowestcumem,MaxMemory[0])
|
||||||
MaxFreeMemory[0] = max(lowestfreecumem,MaxFreeMemory[0])
|
MaxFreeMemory[0] = max(lowestfreecumem,MaxFreeMemory[0])
|
||||||
|
@ -1264,6 +1316,34 @@ def whisper_generate(genparams):
|
||||||
outstr = ret.data.decode("UTF-8","ignore")
|
outstr = ret.data.decode("UTF-8","ignore")
|
||||||
return outstr
|
return outstr
|
||||||
|
|
||||||
|
def tts_load_model(ttc_model_filename,cts_model_filename):
|
||||||
|
global args
|
||||||
|
inputs = tts_load_model_inputs()
|
||||||
|
inputs.debugmode = args.debugmode
|
||||||
|
inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
|
||||||
|
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
|
||||||
|
inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
|
||||||
|
inputs.gpulayers = (999 if args.ttsgpu else 0)
|
||||||
|
inputs = set_backend_props(inputs)
|
||||||
|
ret = handle.tts_load_model(inputs)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def tts_generate(genparams):
|
||||||
|
global args
|
||||||
|
is_quiet = True if (args.quiet or args.debugmode == -1) else False
|
||||||
|
prompt = genparams.get("input", "")
|
||||||
|
prompt = prompt.strip()
|
||||||
|
inputs = tts_generation_inputs()
|
||||||
|
inputs.prompt = prompt.encode("UTF-8")
|
||||||
|
inputs.speaker_seed = 0
|
||||||
|
inputs.audio_seed = 0
|
||||||
|
inputs.quiet = is_quiet
|
||||||
|
ret = handle.tts_generate(inputs)
|
||||||
|
outstr = ""
|
||||||
|
if ret.status==1:
|
||||||
|
outstr = ret.data.decode("UTF-8","ignore")
|
||||||
|
return outstr
|
||||||
|
|
||||||
def tokenize_ids(countprompt,tcaddspecial):
|
def tokenize_ids(countprompt,tcaddspecial):
|
||||||
rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
|
rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
|
||||||
countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
|
countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
|
||||||
|
@ -1738,10 +1818,11 @@ def LaunchWebbrowser(target_url, failedmsg):
|
||||||
try:
|
try:
|
||||||
import webbrowser as wb
|
import webbrowser as wb
|
||||||
if wb.open(target_url, autoraise=True):
|
if wb.open(target_url, autoraise=True):
|
||||||
return
|
return
|
||||||
raise RuntimeError("Cannot open default browser")
|
raise RuntimeError("Cannot open default browser")
|
||||||
except Exception:
|
except Exception as e:
|
||||||
try:
|
try:
|
||||||
|
print(f"Browser failed to launch: {e}, attempting to use xdg-open...")
|
||||||
import webbrowser as wb
|
import webbrowser as wb
|
||||||
if wb.get('xdg-open').open(target_url, autoraise=True):
|
if wb.get('xdg-open').open(target_url, autoraise=True):
|
||||||
return
|
return
|
||||||
|
@ -2102,7 +2183,7 @@ Enter Prompt:<br>
|
||||||
|
|
||||||
def do_GET(self):
|
def do_GET(self):
|
||||||
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
|
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
|
||||||
global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
|
global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
|
||||||
self.path = self.path.rstrip('/')
|
self.path = self.path.rstrip('/')
|
||||||
response_body = None
|
response_body = None
|
||||||
content_type = 'application/json'
|
content_type = 'application/json'
|
||||||
|
@ -2160,7 +2241,8 @@ Enter Prompt:<br>
|
||||||
has_password = (password!="")
|
has_password = (password!="")
|
||||||
has_whisper = (fullwhispermodelpath!="")
|
has_whisper = (fullwhispermodelpath!="")
|
||||||
has_search = True if args.websearch else False
|
has_search = True if args.websearch else False
|
||||||
response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search}).encode())
|
has_tts = (ttsmodelpath!="")
|
||||||
|
response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion, "protected":has_password ,"txt2img":has_txt2img,"vision":has_vision,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts}).encode())
|
||||||
|
|
||||||
elif self.path.endswith(('/api/extra/perf')):
|
elif self.path.endswith(('/api/extra/perf')):
|
||||||
global last_req_time, start_time
|
global last_req_time, start_time
|
||||||
|
@ -2521,7 +2603,7 @@ Enter Prompt:<br>
|
||||||
|
|
||||||
reqblocking = False
|
reqblocking = False
|
||||||
muint = int(args.multiuser)
|
muint = int(args.multiuser)
|
||||||
if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="")):
|
if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="")):
|
||||||
muint = 2 # this prevents errors when using voice/img together with text
|
muint = 2 # this prevents errors when using voice/img together with text
|
||||||
multiuserlimit = ((muint-1) if muint > 1 else 6)
|
multiuserlimit = ((muint-1) if muint > 1 else 6)
|
||||||
#backwards compatibility for up to 7 concurrent requests, use default limit of 7 if multiuser set to 1
|
#backwards compatibility for up to 7 concurrent requests, use default limit of 7 if multiuser set to 1
|
||||||
|
@ -2546,6 +2628,7 @@ Enter Prompt:<br>
|
||||||
is_imggen = False
|
is_imggen = False
|
||||||
is_comfyui_imggen = False
|
is_comfyui_imggen = False
|
||||||
is_transcribe = False
|
is_transcribe = False
|
||||||
|
is_tts = False
|
||||||
|
|
||||||
if self.path.endswith('/request'):
|
if self.path.endswith('/request'):
|
||||||
api_format = 1
|
api_format = 1
|
||||||
|
@ -2588,11 +2671,14 @@ Enter Prompt:<br>
|
||||||
if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
|
if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
|
||||||
is_transcribe = True
|
is_transcribe = True
|
||||||
|
|
||||||
if is_imggen or is_transcribe or api_format > 0:
|
if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech'):
|
||||||
|
is_tts = True
|
||||||
|
|
||||||
|
if is_imggen or is_transcribe or is_tts or api_format > 0:
|
||||||
global last_req_time
|
global last_req_time
|
||||||
last_req_time = time.time()
|
last_req_time = time.time()
|
||||||
|
|
||||||
if not is_imggen and not is_transcribe and api_format!=5:
|
if not is_imggen and not is_transcribe and not is_tts and api_format!=5:
|
||||||
if not self.secure_endpoint():
|
if not self.secure_endpoint():
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -2680,6 +2766,21 @@ Enter Prompt:<br>
|
||||||
print("Transcribe: The response could not be sent, maybe connection was terminated?")
|
print("Transcribe: The response could not be sent, maybe connection was terminated?")
|
||||||
time.sleep(0.2) #short delay
|
time.sleep(0.2) #short delay
|
||||||
return
|
return
|
||||||
|
elif is_tts:
|
||||||
|
try:
|
||||||
|
gen = tts_generate(genparams)
|
||||||
|
wav_data = b''
|
||||||
|
if gen:
|
||||||
|
wav_data = base64.b64decode(gen) # Decode the Base64 string into binary data
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header('content-length', str(len(wav_data))) # Set content length
|
||||||
|
self.end_headers(content_type='audio/wav')
|
||||||
|
self.wfile.write(wav_data) # Write the binary WAV data to the response
|
||||||
|
except Exception as ex:
|
||||||
|
utfprint(ex,0)
|
||||||
|
print("TTS: The response could not be sent, maybe connection was terminated?")
|
||||||
|
time.sleep(0.2) #short delay
|
||||||
|
return
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
time.sleep(0.05)
|
time.sleep(0.05)
|
||||||
|
@ -2806,7 +2907,7 @@ def show_gui():
|
||||||
if dlfile:
|
if dlfile:
|
||||||
args.model_param = dlfile
|
args.model_param = dlfile
|
||||||
load_config_cli(args.model_param)
|
load_config_cli(args.model_param)
|
||||||
if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel:
|
if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel:
|
||||||
global exitcounter
|
global exitcounter
|
||||||
exitcounter = 999
|
exitcounter = 999
|
||||||
exit_with_error(2,"No ggml model or kcpps file was selected. Exiting.")
|
exit_with_error(2,"No ggml model or kcpps file was selected. Exiting.")
|
||||||
|
@ -3008,6 +3109,9 @@ def show_gui():
|
||||||
sd_quant_var = ctk.IntVar(value=0)
|
sd_quant_var = ctk.IntVar(value=0)
|
||||||
|
|
||||||
whisper_model_var = ctk.StringVar()
|
whisper_model_var = ctk.StringVar()
|
||||||
|
tts_model_var = ctk.StringVar()
|
||||||
|
wavtokenizer_var = ctk.StringVar()
|
||||||
|
ttsgpu_var = ctk.IntVar(value=0)
|
||||||
|
|
||||||
def tabbuttonaction(name):
|
def tabbuttonaction(name):
|
||||||
for t in tabcontent:
|
for t in tabcontent:
|
||||||
|
@ -3158,7 +3262,8 @@ def show_gui():
|
||||||
whisperfilepath = whisper_model_var.get()
|
whisperfilepath = whisper_model_var.get()
|
||||||
mmprojfilepath = mmproj_var.get()
|
mmprojfilepath = mmproj_var.get()
|
||||||
draftmodelpath = draftmodel_var.get()
|
draftmodelpath = draftmodel_var.get()
|
||||||
extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath)
|
ttsmodelpath = tts_model_var.get() if ttsgpu_var.get()==1 else ""
|
||||||
|
extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,draftmodelpath,ttsmodelpath)
|
||||||
changed_gpulayers_estimate()
|
changed_gpulayers_estimate()
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -3575,8 +3680,14 @@ def show_gui():
|
||||||
|
|
||||||
# audio tab
|
# audio tab
|
||||||
audio_tab = tabcontent["Audio"]
|
audio_tab = tabcontent["Audio"]
|
||||||
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded.")
|
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
|
||||||
whisper_model_var.trace("w", gui_changed_modelfile)
|
whisper_model_var.trace("w", gui_changed_modelfile)
|
||||||
|
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
|
||||||
|
tts_model_var.trace("w", gui_changed_modelfile)
|
||||||
|
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
|
||||||
|
wavtokenizer_var.trace("w", gui_changed_modelfile)
|
||||||
|
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
|
||||||
|
ttsgpu_var.trace("w", gui_changed_modelfile)
|
||||||
|
|
||||||
def kcpp_export_template():
|
def kcpp_export_template():
|
||||||
nonlocal kcpp_exporting_template
|
nonlocal kcpp_exporting_template
|
||||||
|
@ -3625,7 +3736,7 @@ def show_gui():
|
||||||
|
|
||||||
# launch
|
# launch
|
||||||
def guilaunch():
|
def guilaunch():
|
||||||
if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1:
|
if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and nomodel.get()!=1:
|
||||||
tmp = askopenfilename(title="Select ggml model .bin or .gguf file")
|
tmp = askopenfilename(title="Select ggml model .bin or .gguf file")
|
||||||
model_var.set(tmp)
|
model_var.set(tmp)
|
||||||
nonlocal nextstate
|
nonlocal nextstate
|
||||||
|
@ -3792,6 +3903,11 @@ def show_gui():
|
||||||
if whisper_model_var.get() != "":
|
if whisper_model_var.get() != "":
|
||||||
args.whispermodel = whisper_model_var.get()
|
args.whispermodel = whisper_model_var.get()
|
||||||
|
|
||||||
|
if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
|
||||||
|
args.ttsmodel = tts_model_var.get()
|
||||||
|
args.ttswavtokenizer = wavtokenizer_var.get()
|
||||||
|
args.ttsgpu = (ttsgpu_var.get()==1)
|
||||||
|
|
||||||
def import_vars(dict):
|
def import_vars(dict):
|
||||||
global importvars_in_progress
|
global importvars_in_progress
|
||||||
importvars_in_progress = True
|
importvars_in_progress = True
|
||||||
|
@ -3952,6 +4068,10 @@ def show_gui():
|
||||||
|
|
||||||
whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
|
whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
|
||||||
|
|
||||||
|
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
|
||||||
|
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
|
||||||
|
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
|
||||||
|
|
||||||
importvars_in_progress = False
|
importvars_in_progress = False
|
||||||
gui_changed_modelfile()
|
gui_changed_modelfile()
|
||||||
if "istemplate" in dict and dict["istemplate"]:
|
if "istemplate" in dict and dict["istemplate"]:
|
||||||
|
@ -4022,7 +4142,7 @@ def show_gui():
|
||||||
kcpp_exporting_template = False
|
kcpp_exporting_template = False
|
||||||
export_vars()
|
export_vars()
|
||||||
|
|
||||||
if not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel:
|
if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel:
|
||||||
exitcounter = 999
|
exitcounter = 999
|
||||||
print("")
|
print("")
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
@ -4566,7 +4686,7 @@ def analyze_gguf_model_wrapper(filename=""):
|
||||||
|
|
||||||
def main(launch_args,start_server=True):
|
def main(launch_args,start_server=True):
|
||||||
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
|
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
|
||||||
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
|
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
|
||||||
|
|
||||||
args = launch_args
|
args = launch_args
|
||||||
if (args.version) and len(sys.argv) <= 2:
|
if (args.version) and len(sys.argv) <= 2:
|
||||||
|
@ -4629,7 +4749,7 @@ def main(launch_args,start_server=True):
|
||||||
if not args.model_param:
|
if not args.model_param:
|
||||||
args.model_param = args.model
|
args.model_param = args.model
|
||||||
|
|
||||||
if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.nomodel):
|
if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.nomodel):
|
||||||
#give them a chance to pick a file
|
#give them a chance to pick a file
|
||||||
print("For command line arguments, please refer to --help")
|
print("For command line arguments, please refer to --help")
|
||||||
print("***")
|
print("***")
|
||||||
|
@ -4753,6 +4873,14 @@ def main(launch_args,start_server=True):
|
||||||
dlfile = download_model_from_url(args.draftmodel,[".gguf"])
|
dlfile = download_model_from_url(args.draftmodel,[".gguf"])
|
||||||
if dlfile:
|
if dlfile:
|
||||||
args.draftmodel = dlfile
|
args.draftmodel = dlfile
|
||||||
|
if args.ttsmodel and args.ttsmodel!="":
|
||||||
|
dlfile = download_model_from_url(args.ttsmodel,[".gguf"])
|
||||||
|
if dlfile:
|
||||||
|
args.ttsmodel = dlfile
|
||||||
|
if args.ttswavtokenizer and args.ttswavtokenizer!="":
|
||||||
|
dlfile = download_model_from_url(args.ttswavtokenizer,[".gguf"])
|
||||||
|
if dlfile:
|
||||||
|
args.ttswavtokenizer = dlfile
|
||||||
|
|
||||||
# sanitize and replace the default vanity name. remember me....
|
# sanitize and replace the default vanity name. remember me....
|
||||||
if args.model_param and args.model_param!="":
|
if args.model_param and args.model_param!="":
|
||||||
|
@ -4830,7 +4958,7 @@ def main(launch_args,start_server=True):
|
||||||
pass
|
pass
|
||||||
if args.gpulayers==-1:
|
if args.gpulayers==-1:
|
||||||
if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
|
if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
|
||||||
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel)
|
extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "")
|
||||||
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
|
layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize)
|
||||||
print(f"Auto Recommended GPU Layers: {layeramt}")
|
print(f"Auto Recommended GPU Layers: {layeramt}")
|
||||||
args.gpulayers = layeramt
|
args.gpulayers = layeramt
|
||||||
|
@ -4999,6 +5127,27 @@ def main(launch_args,start_server=True):
|
||||||
exitcounter = 999
|
exitcounter = 999
|
||||||
exit_with_error(3,"Could not load whisper model: " + whispermodel)
|
exit_with_error(3,"Could not load whisper model: " + whispermodel)
|
||||||
|
|
||||||
|
#handle tts model
|
||||||
|
if args.ttsmodel and args.ttsmodel!="" and args.ttswavtokenizer and args.ttswavtokenizer!="":
|
||||||
|
if not os.path.exists(args.ttsmodel) or not os.path.exists(args.ttswavtokenizer):
|
||||||
|
if args.ignoremissing:
|
||||||
|
print("Ignoring missing TTS model files!")
|
||||||
|
args.ttsmodel = None
|
||||||
|
args.ttswavtokenizer = None
|
||||||
|
else:
|
||||||
|
exitcounter = 999
|
||||||
|
exit_with_error(2,f"Cannot find tts model files: {args.ttsmodel} or {args.ttswavtokenizer}")
|
||||||
|
else:
|
||||||
|
ttsmodelpath = args.ttsmodel
|
||||||
|
ttsmodelpath = os.path.abspath(ttsmodelpath)
|
||||||
|
wavtokpath = args.ttswavtokenizer
|
||||||
|
wavtokpath = os.path.abspath(wavtokpath)
|
||||||
|
loadok = tts_load_model(ttsmodelpath,wavtokpath)
|
||||||
|
print("Load TTS Model OK: " + str(loadok))
|
||||||
|
if not loadok:
|
||||||
|
exitcounter = 999
|
||||||
|
exit_with_error(3,"Could not load TTS model!")
|
||||||
|
|
||||||
|
|
||||||
#load embedded lite
|
#load embedded lite
|
||||||
try:
|
try:
|
||||||
|
@ -5296,7 +5445,12 @@ if __name__ == '__main__':
|
||||||
sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true')
|
sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true')
|
||||||
|
|
||||||
whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
|
whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
|
||||||
whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper bin model to enable Speech-To-Text transcription.", default="")
|
whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
|
||||||
|
|
||||||
|
ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
|
||||||
|
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
|
||||||
|
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
|
||||||
|
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
|
||||||
|
|
||||||
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
||||||
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
|
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
|
||||||
|
|
|
@ -105,6 +105,9 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs);
|
||||||
bool whispertype_load_model(const whisper_load_model_inputs inputs);
|
bool whispertype_load_model(const whisper_load_model_inputs inputs);
|
||||||
whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs);
|
whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs);
|
||||||
|
|
||||||
|
bool ttstype_load_model(const tts_load_model_inputs inputs);
|
||||||
|
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs);
|
||||||
|
|
||||||
void timer_start();
|
void timer_start();
|
||||||
double timer_check();
|
double timer_check();
|
||||||
void print_tok_vec(std::vector<int> &embd);
|
void print_tok_vec(std::vector<int> &embd);
|
||||||
|
|
|
@ -188,13 +188,8 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// TODO: support for clang
|
// TODO: support for clang
|
||||||
#ifdef __GNUC__
|
|
||||||
# define GGML_V3_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
|
||||||
#elif defined(_MSC_VER)
|
|
||||||
# define GGML_V3_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
|
||||||
#else
|
|
||||||
# define GGML_V3_DEPRECATED(func, hint) func
|
# define GGML_V3_DEPRECATED(func, hint) func
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef __GNUC__
|
#ifndef __GNUC__
|
||||||
# define GGML_V3_ATTRIBUTE_FORMAT(...)
|
# define GGML_V3_ATTRIBUTE_FORMAT(...)
|
||||||
|
|
|
@ -436,19 +436,23 @@ struct llama_v2_file_loader {
|
||||||
uint32_t magic = file.read_u32();
|
uint32_t magic = file.read_u32();
|
||||||
uint32_t version = 0;
|
uint32_t version = 0;
|
||||||
|
|
||||||
if (magic != 'ggml') {
|
uint32_t magic_ggjt = 0x67676a74u; // 'ggjt'
|
||||||
|
uint32_t magic_ggmf = 0x67676d66u; // 'ggmf'
|
||||||
|
uint32_t magic_ggml = 0x67676d6cu; // 'ggml'
|
||||||
|
|
||||||
|
if (magic != magic_ggml) {
|
||||||
version = file.read_u32();
|
version = file.read_u32();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (magic == 'ggml' && version == 0) {
|
if (magic == magic_ggml && version == 0) {
|
||||||
file_version = LLAMA_V2_FILE_VERSION_GGML;
|
file_version = LLAMA_V2_FILE_VERSION_GGML;
|
||||||
} else if (magic == 'ggmf' && version == 1) {
|
} else if (magic == magic_ggmf && version == 1) {
|
||||||
file_version = LLAMA_V2_FILE_VERSION_GGMF_V1;
|
file_version = LLAMA_V2_FILE_VERSION_GGMF_V1;
|
||||||
} else if (magic == 'ggjt' && version == 1) {
|
} else if (magic == magic_ggjt && version == 1) {
|
||||||
file_version = LLAMA_V2_FILE_VERSION_GGJT_V1;
|
file_version = LLAMA_V2_FILE_VERSION_GGJT_V1;
|
||||||
} else if (magic == 'ggjt' && version == 2) {
|
} else if (magic == magic_ggjt && version == 2) {
|
||||||
file_version = LLAMA_V2_FILE_VERSION_GGJT_V2;
|
file_version = LLAMA_V2_FILE_VERSION_GGJT_V2;
|
||||||
} else if (magic == 'ggjt' && version == 3) {
|
} else if (magic == magic_ggjt && version == 3) {
|
||||||
file_version = LLAMA_V2_FILE_VERSION_GGJT_V3;
|
file_version = LLAMA_V2_FILE_VERSION_GGJT_V3;
|
||||||
} else {
|
} else {
|
||||||
throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||||
|
@ -553,7 +557,8 @@ struct llama_v2_file_saver {
|
||||||
write_vocab();
|
write_vocab();
|
||||||
}
|
}
|
||||||
void write_magic() {
|
void write_magic() {
|
||||||
file.write_u32(LLAMA_V2_FILE_MAGIC); // magic
|
uint32_t magic_ggjt = 0x67676a74u; // 'ggjt'
|
||||||
|
file.write_u32(magic_ggjt); // magic
|
||||||
file.write_u32(LLAMA_V2_FILE_VERSION); // version
|
file.write_u32(LLAMA_V2_FILE_VERSION); // version
|
||||||
}
|
}
|
||||||
void write_hparams(enum llama_v2_ftype new_ftype) {
|
void write_hparams(enum llama_v2_ftype new_ftype) {
|
||||||
|
@ -2308,7 +2313,8 @@ int llama_v2_apply_lora_from_file_internal(struct llama_v2_context * ctx, const
|
||||||
{
|
{
|
||||||
uint32_t magic;
|
uint32_t magic;
|
||||||
fin.read((char *) &magic, sizeof(magic));
|
fin.read((char *) &magic, sizeof(magic));
|
||||||
if (magic != 'ggla') {
|
uint32_t magic_ggla = 0x67676c61u; // 'ggla'
|
||||||
|
if (magic != magic_ggla) {
|
||||||
fprintf(stderr, "%s: bad file magic\n", __func__);
|
fprintf(stderr, "%s: bad file magic\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -2800,85 +2806,6 @@ size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * sr
|
||||||
return nread;
|
return nread;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
|
||||||
llama_v2_file file(path_session, "rb");
|
|
||||||
|
|
||||||
// sanity checks
|
|
||||||
{
|
|
||||||
const uint32_t magic = file.read_u32();
|
|
||||||
const uint32_t version = file.read_u32();
|
|
||||||
|
|
||||||
if (magic != LLAMA_V2_SESSION_MAGIC || version != LLAMA_V2_SESSION_VERSION) {
|
|
||||||
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_v2_hparams session_hparams;
|
|
||||||
file.read_raw(&session_hparams, sizeof(llama_v2_hparams));
|
|
||||||
|
|
||||||
if (session_hparams != ctx->model.hparams) {
|
|
||||||
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// load the prompt
|
|
||||||
{
|
|
||||||
const uint32_t n_token_count = file.read_u32();
|
|
||||||
|
|
||||||
if (n_token_count > n_token_capacity) {
|
|
||||||
fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
file.read_raw(tokens_out, sizeof(llama_v2_token) * n_token_count);
|
|
||||||
*n_token_count_out = n_token_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
// restore the context state
|
|
||||||
{
|
|
||||||
const size_t n_state_size_cur = file.size - file.tell();
|
|
||||||
const size_t n_state_size_max = llama_v2_get_state_size(ctx);
|
|
||||||
|
|
||||||
if (n_state_size_cur > n_state_size_max) {
|
|
||||||
fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<uint8_t> state_data(n_state_size_max);
|
|
||||||
file.read_raw(state_data.data(), n_state_size_cur);
|
|
||||||
|
|
||||||
llama_v2_set_state_data(ctx, state_data.data());
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count) {
|
|
||||||
llama_v2_file file(path_session, "wb");
|
|
||||||
|
|
||||||
file.write_u32(LLAMA_V2_SESSION_MAGIC);
|
|
||||||
file.write_u32(LLAMA_V2_SESSION_VERSION);
|
|
||||||
|
|
||||||
file.write_raw(&ctx->model.hparams, sizeof(llama_v2_hparams));
|
|
||||||
|
|
||||||
// save the prompt
|
|
||||||
file.write_u32((uint32_t) n_token_count);
|
|
||||||
file.write_raw(tokens, sizeof(llama_v2_token) * n_token_count);
|
|
||||||
|
|
||||||
// save the context state
|
|
||||||
{
|
|
||||||
const size_t n_state_size_max = llama_v2_get_state_size(ctx);
|
|
||||||
|
|
||||||
std::vector<uint8_t> state_data(n_state_size_max);
|
|
||||||
const size_t n_state_size_cur = llama_v2_copy_state_data(ctx, state_data.data());
|
|
||||||
|
|
||||||
file.write_raw(state_data.data(), n_state_size_cur);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int llama_v2_eval(
|
int llama_v2_eval(
|
||||||
struct llama_v2_context * ctx,
|
struct llama_v2_context * ctx,
|
||||||
const llama_v2_token * tokens,
|
const llama_v2_token * tokens,
|
||||||
|
|
|
@ -140,10 +140,6 @@ extern "C" {
|
||||||
// Returns the number of bytes read
|
// Returns the number of bytes read
|
||||||
LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src);
|
LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src);
|
||||||
|
|
||||||
// Save/load session file
|
|
||||||
LLAMA_V2_API bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
|
||||||
LLAMA_V2_API bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count);
|
|
||||||
|
|
||||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||||
// tokens + n_tokens is the provided batch of new tokens to process
|
// tokens + n_tokens is the provided batch of new tokens to process
|
||||||
// n_past is the number of tokens to use from previous eval calls
|
// n_past is the number of tokens to use from previous eval calls
|
||||||
|
@ -167,7 +163,7 @@ extern "C" {
|
||||||
int n_max_tokens,
|
int n_max_tokens,
|
||||||
bool add_bos);
|
bool add_bos);
|
||||||
|
|
||||||
|
|
||||||
std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos);
|
std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos);
|
||||||
|
|
||||||
LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx);
|
LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx);
|
||||||
|
|
|
@ -126,7 +126,7 @@ struct rwkv_v2_model {
|
||||||
|
|
||||||
// Finds model parameter by key and sets it into dest.
|
// Finds model parameter by key and sets it into dest.
|
||||||
// If the parameter was not found, returns false.
|
// If the parameter was not found, returns false.
|
||||||
bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, char * key, struct ggml_v2_tensor ** dest) {
|
bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, const char * key, struct ggml_v2_tensor ** dest) {
|
||||||
struct ggml_v2_tensor * parameter = (*parameters)[key];
|
struct ggml_v2_tensor * parameter = (*parameters)[key];
|
||||||
RWKV_V2_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key);
|
RWKV_V2_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key);
|
||||||
*dest = parameter;
|
*dest = parameter;
|
||||||
|
@ -135,7 +135,7 @@ bool rwkv_v2_set_parameter(std::unordered_map<std::string, struct ggml_v2_tensor
|
||||||
|
|
||||||
// Finds block parameter by block index and key and sets it into dest.
|
// Finds block parameter by block index and key and sets it into dest.
|
||||||
// If the parameter was not found, returns false.
|
// If the parameter was not found, returns false.
|
||||||
bool rwkv_v2_set_block_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, int32_t block_index, char * key, struct ggml_v2_tensor ** dest) {
|
bool rwkv_v2_set_block_parameter(std::unordered_map<std::string, struct ggml_v2_tensor *> * parameters, int32_t block_index, const char * key, struct ggml_v2_tensor ** dest) {
|
||||||
char full_key[128];
|
char full_key[128];
|
||||||
sprintf(full_key, "blocks.%d.%s", block_index, key);
|
sprintf(full_key, "blocks.%d.%s", block_index, key);
|
||||||
return rwkv_v2_set_parameter(parameters, full_key, dest);
|
return rwkv_v2_set_parameter(parameters, full_key, dest);
|
||||||
|
|
|
@ -112,28 +112,6 @@ static sd_ctx_t * sd_ctx = nullptr;
|
||||||
static int sddebugmode = 0;
|
static int sddebugmode = 0;
|
||||||
static std::string recent_data = "";
|
static std::string recent_data = "";
|
||||||
|
|
||||||
std::string base64_encode(const unsigned char* data, unsigned int data_length) {
|
|
||||||
const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
||||||
std::string encoded;
|
|
||||||
encoded.reserve(((data_length + 2) / 3) * 4);
|
|
||||||
for (unsigned int i = 0; i < data_length; i += 3) {
|
|
||||||
unsigned int triple = (data[i] << 16) + (i + 1 < data_length ? data[i + 1] << 8 : 0) + (i + 2 < data_length ? data[i + 2] : 0);
|
|
||||||
encoded.push_back(base64_chars[(triple >> 18) & 0x3F]);
|
|
||||||
encoded.push_back(base64_chars[(triple >> 12) & 0x3F]);
|
|
||||||
if (i + 1 < data_length) {
|
|
||||||
encoded.push_back(base64_chars[(triple >> 6) & 0x3F]);
|
|
||||||
} else {
|
|
||||||
encoded.push_back('=');
|
|
||||||
}
|
|
||||||
if (i + 2 < data_length) {
|
|
||||||
encoded.push_back(base64_chars[triple & 0x3F]);
|
|
||||||
} else {
|
|
||||||
encoded.push_back('=');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return encoded;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
|
static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
|
||||||
static bool notiling = false;
|
static bool notiling = false;
|
||||||
bool sdtype_load_model(const sd_load_model_inputs inputs) {
|
bool sdtype_load_model(const sd_load_model_inputs inputs) {
|
||||||
|
@ -553,7 +531,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
|
||||||
unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, "");
|
unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, "");
|
||||||
if (png != NULL)
|
if (png != NULL)
|
||||||
{
|
{
|
||||||
recent_data = base64_encode(png,out_data_len);
|
recent_data = kcpp_base64_encode(png,out_data_len);
|
||||||
free(png);
|
free(png);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
672
otherarch/tts_adapter.cpp
Normal file
672
otherarch/tts_adapter.cpp
Normal file
|
@ -0,0 +1,672 @@
|
||||||
|
#include "model_adapter.h"
|
||||||
|
#include "otherarch/utils.h"
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <regex>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "src/llama-context.h"
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct wav_header {
|
||||||
|
char riff[4] = {'R', 'I', 'F', 'F'};
|
||||||
|
uint32_t chunk_size;
|
||||||
|
char wave[4] = {'W', 'A', 'V', 'E'};
|
||||||
|
char fmt[4] = {'f', 'm', 't', ' '};
|
||||||
|
uint32_t fmt_chunk_size = 16;
|
||||||
|
uint16_t audio_format = 1; // PCM
|
||||||
|
uint16_t num_channels = 1; // Mono
|
||||||
|
uint32_t sample_rate;
|
||||||
|
uint32_t byte_rate;
|
||||||
|
uint16_t block_align;
|
||||||
|
uint16_t bits_per_sample = 16;
|
||||||
|
char data[4] = {'d', 'a', 't', 'a'};
|
||||||
|
uint32_t data_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::string save_wav16_base64(const std::vector<float> &data, int sample_rate) {
|
||||||
|
std::ostringstream oss;
|
||||||
|
wav_header header;
|
||||||
|
|
||||||
|
// Fill header fields
|
||||||
|
header.sample_rate = sample_rate;
|
||||||
|
header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
|
||||||
|
header.block_align = header.num_channels * (header.bits_per_sample / 8);
|
||||||
|
header.data_size = data.size() * (header.bits_per_sample / 8);
|
||||||
|
header.chunk_size = 36 + header.data_size;
|
||||||
|
|
||||||
|
// Write header
|
||||||
|
oss.write(reinterpret_cast<const char*>(&header), sizeof(header));
|
||||||
|
|
||||||
|
// Write samples
|
||||||
|
for (const auto &sample : data) {
|
||||||
|
int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
|
||||||
|
oss.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get binary WAV data
|
||||||
|
std::string wav_data = oss.str();
|
||||||
|
return kcpp_base64_encode(wav_data); //return as base64 string
|
||||||
|
}
|
||||||
|
|
||||||
|
static void fill_hann_window(int length, bool periodic, float * output) {
|
||||||
|
int offset = -1;
|
||||||
|
if (periodic) {
|
||||||
|
offset = 0;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// very poor-man fft
|
||||||
|
static void twiddle(float * real, float * imag, int k, int N) {
|
||||||
|
float angle = 2 * M_PI * k / N;
|
||||||
|
*real = cos(angle);
|
||||||
|
*imag = sin(angle);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void irfft(int n, const float * inp_cplx, float * out_real) {
|
||||||
|
int N = n / 2 + 1;
|
||||||
|
|
||||||
|
std::vector<float> real_input(N);
|
||||||
|
std::vector<float> imag_input(N);
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
real_input[i] = inp_cplx[2 * i];
|
||||||
|
imag_input[i] = inp_cplx[2 * i + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> real_output(n);
|
||||||
|
std::vector<float> imag_output(n);
|
||||||
|
|
||||||
|
for (int k = 0; k < n; ++k) {
|
||||||
|
real_output[k] = 0.0f;
|
||||||
|
imag_output[k] = 0.0f;
|
||||||
|
for (int m = 0; m < N; ++m) {
|
||||||
|
float twiddle_real;
|
||||||
|
float twiddle_imag;
|
||||||
|
|
||||||
|
twiddle(&twiddle_real, &twiddle_imag, k * m, n);
|
||||||
|
|
||||||
|
real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
|
||||||
|
imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
out_real[i] = real_output[i] / N;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
|
||||||
|
int64_t output_height = n_out;
|
||||||
|
int64_t kernel_w = n_win;
|
||||||
|
int64_t stride_w = n_hop;
|
||||||
|
int64_t width = n_out;
|
||||||
|
|
||||||
|
output.resize(width, 0.0f);
|
||||||
|
|
||||||
|
int64_t col_idx = 0;
|
||||||
|
for (int64_t w_col = 0; w_col < width; ++w_col) {
|
||||||
|
int64_t start = w_col * stride_w - n_pad;
|
||||||
|
int64_t end = start + kernel_w;
|
||||||
|
|
||||||
|
for (int64_t w_im = start; w_im < end; ++w_im) {
|
||||||
|
if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
|
||||||
|
output[w_im] += data[col_idx];
|
||||||
|
}
|
||||||
|
col_idx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
output.resize(n_out - 2 * n_pad);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: not optimized at all
|
||||||
|
static std::vector<float> embd_to_audio(
|
||||||
|
const float * embd,
|
||||||
|
const int n_codes,
|
||||||
|
const int n_embd,
|
||||||
|
const int n_thread) {
|
||||||
|
const int n_fft = 1280;
|
||||||
|
const int n_hop = 320;
|
||||||
|
const int n_win = 1280;
|
||||||
|
const int n_pad = (n_win - n_hop)/2;
|
||||||
|
const int n_out = (n_codes - 1)*n_hop + n_win;
|
||||||
|
|
||||||
|
std::vector<float> hann(n_fft);
|
||||||
|
|
||||||
|
fill_hann_window(hann.size(), true, hann.data());
|
||||||
|
|
||||||
|
int n_spec = n_embd*n_codes;
|
||||||
|
|
||||||
|
std::vector<float> E (n_spec);
|
||||||
|
std::vector<float> S (n_spec);
|
||||||
|
std::vector<float> ST(n_spec);
|
||||||
|
|
||||||
|
for (int l = 0; l < n_codes; ++l) {
|
||||||
|
for (int k = 0; k < n_embd; ++k) {
|
||||||
|
E[k*n_codes + l] = embd[l*n_embd + k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int k = 0; k < n_embd/2; ++k) {
|
||||||
|
for (int l = 0; l < n_codes; ++l) {
|
||||||
|
float mag = E[(k )*n_codes + l];
|
||||||
|
float phi = E[(k + n_embd/2)*n_codes + l];
|
||||||
|
|
||||||
|
mag = exp(mag);
|
||||||
|
|
||||||
|
if (mag > 1e2) {
|
||||||
|
mag = 1e2;
|
||||||
|
}
|
||||||
|
S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
|
||||||
|
S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int l = 0; l < n_codes; ++l) {
|
||||||
|
for (int k = 0; k < n_embd/2; ++k) {
|
||||||
|
ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
|
||||||
|
ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> res (n_codes*n_fft);
|
||||||
|
std::vector<float> hann2(n_codes*n_fft);
|
||||||
|
|
||||||
|
std::vector<std::thread> workers(n_thread);
|
||||||
|
for (int i = 0; i < n_thread; ++i) {
|
||||||
|
workers[i] = std::thread([&, i]() {
|
||||||
|
for (int l = i; l < n_codes; l += n_thread) {
|
||||||
|
irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
|
||||||
|
for (int j = 0; j < n_fft; ++j) {
|
||||||
|
res [l*n_fft + j] *= hann[j];
|
||||||
|
hann2[l*n_fft + j] = hann[j] * hann[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for (int i = 0; i < n_thread; ++i) {
|
||||||
|
workers[i].join();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> audio;
|
||||||
|
std::vector<float> env;
|
||||||
|
|
||||||
|
fold(res, n_out, n_win, n_hop, n_pad, audio);
|
||||||
|
fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
|
||||||
|
|
||||||
|
for (size_t i = 0; i < audio.size(); ++i) {
|
||||||
|
audio[i] /= env[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return audio;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const std::map<int, std::string> ones = {
|
||||||
|
{0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
|
||||||
|
{5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
|
||||||
|
{10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
|
||||||
|
{15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
|
||||||
|
};
|
||||||
|
|
||||||
|
static const std::map<int, std::string> tens = {
|
||||||
|
{2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
|
||||||
|
{6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Convert a number less than 1000 to words
|
||||||
|
static std::string convert_less_than_thousand(int num) {
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
if (num >= 100) {
|
||||||
|
result += ones.at(num / 100) + " hundred ";
|
||||||
|
num %= 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (num >= 20) {
|
||||||
|
result += tens.at(num / 10);
|
||||||
|
if (num % 10 > 0) {
|
||||||
|
result += "-" + ones.at(num % 10);
|
||||||
|
}
|
||||||
|
} else if (num > 0) {
|
||||||
|
result += ones.at(num);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string number_to_words(const std::string & number_str) {
|
||||||
|
try {
|
||||||
|
size_t decimal_pos = number_str.find('.');
|
||||||
|
std::string integer_part = number_str.substr(0, decimal_pos);
|
||||||
|
|
||||||
|
int int_number = std::stoi(integer_part);
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
if (int_number == 0) {
|
||||||
|
result = "zero";
|
||||||
|
} else {
|
||||||
|
if (int_number >= 1000000000) {
|
||||||
|
int billions = int_number / 1000000000;
|
||||||
|
result += convert_less_than_thousand(billions) + " billion ";
|
||||||
|
int_number %= 1000000000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (int_number >= 1000000) {
|
||||||
|
int millions = int_number / 1000000;
|
||||||
|
result += convert_less_than_thousand(millions) + " million ";
|
||||||
|
int_number %= 1000000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (int_number >= 1000) {
|
||||||
|
int thousands = int_number / 1000;
|
||||||
|
result += convert_less_than_thousand(thousands) + " thousand ";
|
||||||
|
int_number %= 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (int_number > 0) {
|
||||||
|
result += convert_less_than_thousand(int_number);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle decimal part
|
||||||
|
if (decimal_pos != std::string::npos) {
|
||||||
|
result += " point";
|
||||||
|
std::string decimal_part = number_str.substr(decimal_pos + 1);
|
||||||
|
for (char digit : decimal_part) {
|
||||||
|
result += " " + ones.at(digit - '0');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
// Skip if fails
|
||||||
|
return " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string replace_numbers_with_words(const std::string & input_text) {
|
||||||
|
std::regex number_pattern(R"(\d+(\.\d+)?)");
|
||||||
|
std::string result;
|
||||||
|
auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
|
||||||
|
auto end = std::sregex_iterator();
|
||||||
|
|
||||||
|
size_t last_pos = 0;
|
||||||
|
for (std::sregex_iterator i = it; i != end; ++i) {
|
||||||
|
const std::smatch& match = *i;
|
||||||
|
result.append(input_text, last_pos, match.position() - last_pos);
|
||||||
|
result.append(number_to_words(match.str()));
|
||||||
|
last_pos = match.position() + match.length();
|
||||||
|
}
|
||||||
|
result.append(input_text, last_pos);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string process_text(const std::string & text) {
|
||||||
|
|
||||||
|
std::string processed_text = replace_numbers_with_words(text);
|
||||||
|
|
||||||
|
std::transform(processed_text.begin(), processed_text.end(),
|
||||||
|
processed_text.begin(), ::tolower);
|
||||||
|
|
||||||
|
std::regex special_chars(R"([-_/,\.\\])");
|
||||||
|
processed_text = std::regex_replace(processed_text, special_chars, " ");
|
||||||
|
std::regex non_alpha(R"([^a-z\s])");
|
||||||
|
processed_text = std::regex_replace(processed_text, non_alpha, "");
|
||||||
|
std::regex multiple_spaces(R"(\s+)");
|
||||||
|
processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
|
||||||
|
processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
|
||||||
|
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
|
||||||
|
|
||||||
|
return processed_text;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
|
||||||
|
prompt.insert(prompt.end(), tokens.begin(), tokens.end());
|
||||||
|
}
|
||||||
|
static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
|
||||||
|
auto tmp = common_tokenize(model, txt, add_special, parse_special);
|
||||||
|
prompt_add(prompt, tmp);
|
||||||
|
}
|
||||||
|
static void prompt_init(llama_tokens & prompt, const llama_model * model) {
|
||||||
|
prompt.clear();
|
||||||
|
prompt_add(prompt, model, "<|im_start|>\n", true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<llama_token> prepare_guide_tokens(const llama_model * model, const std::string& str)
|
||||||
|
{
|
||||||
|
const std::string& delimiter = "<|text_sep|>";
|
||||||
|
|
||||||
|
std::vector<llama_token> result;
|
||||||
|
size_t start = 0;
|
||||||
|
size_t end = str.find(delimiter);
|
||||||
|
|
||||||
|
while (end != std::string::npos) {
|
||||||
|
std::string current_word = str.substr(start, end - start);
|
||||||
|
auto tmp = common_tokenize(model, current_word, false, true);
|
||||||
|
result.push_back(tmp[0]);
|
||||||
|
start = end + delimiter.length();
|
||||||
|
end = str.find(delimiter, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the last part
|
||||||
|
std::string current_word = str.substr(start);
|
||||||
|
auto tmp = common_tokenize(model, current_word, false, true);
|
||||||
|
result.push_back(tmp[0]);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static llama_context * ttc_ctx = nullptr; //text to codes ctx
|
||||||
|
static llama_context * cts_ctx = nullptr; //codes to speech
|
||||||
|
|
||||||
|
static int ttsdebugmode = 0;
|
||||||
|
static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
|
||||||
|
static std::string last_generated_audio = "";
|
||||||
|
|
||||||
|
bool ttstype_load_model(const tts_load_model_inputs inputs)
|
||||||
|
{
|
||||||
|
//duplicated from expose.cpp
|
||||||
|
int cl_parseinfo = inputs.clblast_info; //first digit is whether configured, second is platform, third is devices
|
||||||
|
std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
|
||||||
|
putenv((char*)usingclblast.c_str());
|
||||||
|
cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
|
||||||
|
int platform = cl_parseinfo/10;
|
||||||
|
int devices = cl_parseinfo%10;
|
||||||
|
ttsplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
|
||||||
|
ttsdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
|
||||||
|
putenv((char*)ttsplatformenv.c_str());
|
||||||
|
putenv((char*)ttsdeviceenv.c_str());
|
||||||
|
std::string vulkan_info_raw = inputs.vulkan_info;
|
||||||
|
std::string vulkan_info_str = "";
|
||||||
|
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
|
||||||
|
vulkan_info_str += vulkan_info_raw[i];
|
||||||
|
if (i < vulkan_info_raw.length() - 1) {
|
||||||
|
vulkan_info_str += ",";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(vulkan_info_str!="")
|
||||||
|
{
|
||||||
|
ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
|
||||||
|
putenv((char*)ttsvulkandeviceenv.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_backend_init();
|
||||||
|
|
||||||
|
std::string modelfile_ttc = inputs.ttc_model_filename;
|
||||||
|
std::string modelfile_cts = inputs.cts_model_filename;
|
||||||
|
printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
|
||||||
|
|
||||||
|
ttsdebugmode = inputs.debugmode;
|
||||||
|
|
||||||
|
// tts init
|
||||||
|
llama_model_params tts_model_params = llama_model_default_params();
|
||||||
|
llama_context_params tts_ctx_params = llama_context_default_params();
|
||||||
|
|
||||||
|
const int nthreads = 4;
|
||||||
|
|
||||||
|
tts_model_params.use_mmap = false;
|
||||||
|
tts_model_params.use_mlock = false;
|
||||||
|
tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
|
||||||
|
tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
|
||||||
|
tts_ctx_params.n_ctx = 8192;
|
||||||
|
tts_ctx_params.logits_all = false;
|
||||||
|
tts_ctx_params.offload_kqv = true;
|
||||||
|
tts_ctx_params.n_batch = 8192;
|
||||||
|
tts_ctx_params.n_ubatch = 512;
|
||||||
|
tts_ctx_params.n_threads = nthreads;
|
||||||
|
tts_ctx_params.n_threads_batch = nthreads;
|
||||||
|
tts_ctx_params.flash_attn = false;
|
||||||
|
|
||||||
|
llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
|
||||||
|
ttc_ctx = llama_new_context_with_model(ttcmodel, tts_ctx_params);
|
||||||
|
|
||||||
|
if (ttc_ctx == nullptr) {
|
||||||
|
printf("\nTTS Load Error: Failed to initialize ttc context!\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
|
||||||
|
|
||||||
|
tts_ctx_params.embeddings = true; //this requires embeddings instead
|
||||||
|
cts_ctx = llama_new_context_with_model(ctsmodel, tts_ctx_params);
|
||||||
|
|
||||||
|
if (cts_ctx == nullptr) {
|
||||||
|
printf("\nTTS Load Error: Failed to initialize cts context!\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> tmp = {1, 2, 3, 4};
|
||||||
|
llama_kv_cache_clear(ttc_ctx);
|
||||||
|
auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||||
|
if(er!=0)
|
||||||
|
{
|
||||||
|
printf("\nTTS Eval returned nonzero: %d\n",er);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\nTTS Load Complete.\n");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
|
||||||
|
{
|
||||||
|
tts_generation_outputs output;
|
||||||
|
|
||||||
|
if(ttc_ctx==nullptr || cts_ctx==nullptr)
|
||||||
|
{
|
||||||
|
printf("\nWarning: KCPP TTS not initialized!\n");
|
||||||
|
output.data = "";
|
||||||
|
output.status = 0;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_token> codes;
|
||||||
|
std::vector<llama_token> guide_tokens;
|
||||||
|
const llama_model * model_ttc = &(ttc_ctx->model);
|
||||||
|
const llama_model * model_cts = &(cts_ctx->model);
|
||||||
|
const int ttc_n_vocab = llama_n_vocab(model_ttc);
|
||||||
|
std::string prompt = inputs.prompt;
|
||||||
|
|
||||||
|
if(!inputs.quiet)
|
||||||
|
{
|
||||||
|
printf("\nTTS Generating... ");
|
||||||
|
}
|
||||||
|
|
||||||
|
// process prompt and generate voice codes
|
||||||
|
|
||||||
|
std::vector<llama_token> prompt_inp;
|
||||||
|
prompt_init(prompt_inp, model_ttc);
|
||||||
|
prompt_add(prompt_inp, model_ttc, "<|text_start|>", false, true);
|
||||||
|
|
||||||
|
int speaker_seed = inputs.speaker_seed;
|
||||||
|
int audio_seed = inputs.audio_seed;
|
||||||
|
if (speaker_seed <= 0 || speaker_seed==0xFFFFFFFF)
|
||||||
|
{
|
||||||
|
speaker_seed = (((uint32_t)time(NULL)) % 1000000u);
|
||||||
|
if(ttsdebugmode==1)
|
||||||
|
{
|
||||||
|
printf("\nUsing Speaker Seed: %d", speaker_seed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
|
||||||
|
{
|
||||||
|
audio_seed = (((uint32_t)time(NULL)) % 1000000u);
|
||||||
|
if(ttsdebugmode==1)
|
||||||
|
{
|
||||||
|
printf("\nUsing Audio Seed: %d", audio_seed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::mt19937 tts_rng(audio_seed);
|
||||||
|
std::mt19937 speaker_rng(speaker_seed);
|
||||||
|
|
||||||
|
//add the speaker based on the seed
|
||||||
|
if(speaker_seed>0)
|
||||||
|
{
|
||||||
|
std::string sampletext = "but<|text_sep|>that<|text_sep|>is<|text_sep|>what<|text_sep|>it<|text_sep|>is<|text_sep|>";
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert the input text into the necessary format expected by OuteTTS
|
||||||
|
std::string prompt_clean = process_text(prompt);
|
||||||
|
|
||||||
|
if(prompt_clean.size()==0)
|
||||||
|
{
|
||||||
|
//no input
|
||||||
|
if(!inputs.quiet)
|
||||||
|
{
|
||||||
|
printf("\nTTS sent empty input.\n");
|
||||||
|
output.data = "";
|
||||||
|
output.status = 1;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!inputs.quiet && ttsdebugmode==1)
|
||||||
|
{
|
||||||
|
printf("\nInput: %s\n", prompt_clean.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
guide_tokens = prepare_guide_tokens(model_ttc,prompt_clean);
|
||||||
|
prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
|
||||||
|
|
||||||
|
if(!inputs.quiet)
|
||||||
|
{
|
||||||
|
printf(" (%d input words)...", guide_tokens.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
|
||||||
|
|
||||||
|
//create batch with tokens for decoding prompt processing
|
||||||
|
llama_kv_cache_clear(ttc_ctx);
|
||||||
|
llama_kv_cache_clear(cts_ctx);
|
||||||
|
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
|
||||||
|
|
||||||
|
auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
|
||||||
|
if (!evalok) {
|
||||||
|
printf("\nError: TTS prompt batch processing failed\n");
|
||||||
|
output.data = "";
|
||||||
|
output.status = 0;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
// main loop
|
||||||
|
int n_decode = 0;
|
||||||
|
int n_predict = 4096; //max 4096 tokens
|
||||||
|
|
||||||
|
bool next_token_uses_guide_token = true;
|
||||||
|
|
||||||
|
while (n_decode <= n_predict)
|
||||||
|
{
|
||||||
|
float * logits = llama_get_logits(ttc_ctx);
|
||||||
|
|
||||||
|
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,20,1.0,tts_rng);
|
||||||
|
|
||||||
|
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
|
||||||
|
if(!guide_tokens.empty() && next_token_uses_guide_token && !llama_token_is_control(model_ttc, new_token_id) && !llama_token_is_eog(model_ttc, new_token_id))
|
||||||
|
{
|
||||||
|
llama_token guide_token = guide_tokens[0];
|
||||||
|
guide_tokens.erase(guide_tokens.begin());
|
||||||
|
new_token_id = guide_token; //ensure correct word fragment is used
|
||||||
|
}
|
||||||
|
|
||||||
|
//this is the token id that always precedes a new word
|
||||||
|
next_token_uses_guide_token = (new_token_id == 198);
|
||||||
|
|
||||||
|
codes.push_back(new_token_id);
|
||||||
|
|
||||||
|
// is it an end of generation? -> mark the stream as finished
|
||||||
|
if (llama_token_is_eog(model_ttc, new_token_id) || n_decode >= n_predict) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
n_decode += 1;
|
||||||
|
std::vector<llama_token> next = {new_token_id};
|
||||||
|
llama_batch batch = llama_batch_get_one(next.data(), next.size());
|
||||||
|
|
||||||
|
// evaluate the current batch with the transformer model
|
||||||
|
if (llama_decode(ttc_ctx, batch)) {
|
||||||
|
printf("\nError: TTS code generation failed!\n");
|
||||||
|
output.data = "";
|
||||||
|
output.status = 0;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!inputs.quiet && ttsdebugmode==1)
|
||||||
|
{
|
||||||
|
const std::string inp_txt = common_detokenize(ttc_ctx, codes, true);
|
||||||
|
|
||||||
|
printf("\nGenerated %d Codes: '%s'\n",codes.size(), inp_txt.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove all non-audio tokens (i.e. < 151672 || > 155772)
|
||||||
|
codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
|
||||||
|
|
||||||
|
for (auto & token : codes) {
|
||||||
|
token -= 151672;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int n_codes = codes.size();
|
||||||
|
if(n_codes<=1)
|
||||||
|
{
|
||||||
|
printf("\nWarning: TTS vocoder generated nothing!\n");
|
||||||
|
output.data = "";
|
||||||
|
output.status = 0;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
|
||||||
|
|
||||||
|
if (llama_decode(cts_ctx, codebatch.batch) != 0) {
|
||||||
|
printf("\nError: TTS vocoder generation failed!\n");
|
||||||
|
output.data = "";
|
||||||
|
output.status = 0;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// spectral operations
|
||||||
|
const int n_embd = llama_n_embd(model_cts);
|
||||||
|
const float * embd = llama_get_embeddings(cts_ctx);
|
||||||
|
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
|
||||||
|
|
||||||
|
const int n_sr = 24000; // sampling rate
|
||||||
|
|
||||||
|
// zero out first 0.05 seconds
|
||||||
|
for (int i = 0; i < 24000/20; ++i) {
|
||||||
|
audio[i] = 0.0f;
|
||||||
|
}
|
||||||
|
//add some silence at the end
|
||||||
|
for (int i = 0; i < 24000/20; ++i) {
|
||||||
|
audio.push_back(0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
last_generated_audio = save_wav16_base64(audio, n_sr);
|
||||||
|
|
||||||
|
if(!inputs.quiet)
|
||||||
|
{
|
||||||
|
printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
output.data = last_generated_audio.c_str();
|
||||||
|
output.status = 1;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,6 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
@ -303,6 +304,47 @@ std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string)
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
// Encode a raw byte buffer as standard base64 (RFC 4648 alphabet, '=' padded).
// Produces exactly ((data_length + 2) / 3) * 4 output characters.
std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length) {
    static const char alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    std::string out;
    out.reserve(((data_length + 2) / 3) * 4);
    for (unsigned int idx = 0; idx < data_length; idx += 3) {
        const unsigned int remain = data_length - idx; // bytes available in this 3-byte group
        // pack up to three input bytes into one 24-bit group
        unsigned int group = (unsigned int)(data[idx]) << 16;
        if (remain > 1) { group |= (unsigned int)(data[idx + 1]) << 8; }
        if (remain > 2) { group |= (unsigned int)(data[idx + 2]); }
        out.push_back(alphabet[(group >> 18) & 0x3F]);
        out.push_back(alphabet[(group >> 12) & 0x3F]);
        // missing input bytes become '=' padding
        out.push_back(remain > 1 ? alphabet[(group >> 6) & 0x3F] : '=');
        out.push_back(remain > 2 ? alphabet[group & 0x3F] : '=');
    }
    return out;
}
|
||||||
|
// Encode the bytes of a std::string as standard base64 ('=' padded).
std::string kcpp_base64_encode(const std::string &data) {
    static const char tbl[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const size_t len = data.size();
    std::string result;
    result.reserve(((len + 2) / 3) * 4);
    size_t i = 0;
    while (i < len) {
        // assemble a 24-bit group from up to three input bytes
        const bool has_second = (i + 1) < len;
        const bool has_third  = (i + 2) < len;
        unsigned int group = (unsigned int)(unsigned char)data[i] << 16;
        if (has_second) { group |= (unsigned int)(unsigned char)data[i + 1] << 8; }
        if (has_third)  { group |= (unsigned int)(unsigned char)data[i + 2]; }
        result.push_back(tbl[(group >> 18) & 0x3F]);
        result.push_back(tbl[(group >> 12) & 0x3F]);
        // pad with '=' where input bytes are missing
        result.push_back(has_second ? tbl[(group >> 6) & 0x3F] : '=');
        result.push_back(has_third  ? tbl[group & 0x3F] : '=');
        i += 3;
    }
    return result;
}
|
||||||
|
|
||||||
std::string get_timestamp_str()
|
std::string get_timestamp_str()
|
||||||
{
|
{
|
||||||
|
@ -314,3 +356,150 @@ std::string get_timestamp_str()
|
||||||
std::string timestamp(buffer);
|
std::string timestamp(buffer);
|
||||||
return timestamp;
|
return timestamp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A very rudimentary all-in-one sampling function with no dependencies.
// Greedy argmax when temp <= 0 or top_k == 1; otherwise temperature-scaled
// softmax sampling restricted to the top_k highest logits (top_k is clamped
// into (0, 300], and never exceeds n_logits). Returns the sampled token id.
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
{
    // Greedy path: no randomness needed, just pick the single best logit.
    if (temp <= 0 || top_k == 1) {
        int32_t best = 0;
        for (int i = 1; i < n_logits; ++i) {
            if (logits[i] > logits[best]) {
                best = i;
            }
        }
        return best;
    }

    // Clamp top_k into a sane range.
    if (top_k <= 0 || top_k > 300) {
        top_k = 300;
    }
    top_k = std::min(top_k, n_logits);

    // Temperature-scale every logit, keeping its token id alongside.
    std::vector<std::pair<float, int32_t>> scored;
    scored.reserve(n_logits);
    const float inv_temp = 1.0f / temp;
    for (int i = 0; i < n_logits; ++i) {
        scored.emplace_back(logits[i] * inv_temp, i);
    }

    // Keep only the top_k best-scoring candidates, sorted descending.
    std::partial_sort(
        scored.begin(),
        scored.begin() + top_k, scored.end(),
        [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
            return a.first > b.first;
        });
    scored.resize(top_k);

    // Softmax over the survivors; subtract the max score for numerical stability.
    std::vector<float> probs;
    probs.reserve(scored.size());
    const float top_score = scored[0].first;
    double total = 0.0;
    for (const auto & entry : scored) {
        const float p = expf(entry.first - top_score);
        probs.push_back(p);
        total += p;
    }
    for (auto & p : probs) {
        p /= total;
    }

    // Draw one candidate index proportionally to its probability.
    std::discrete_distribution<> dist(probs.begin(), probs.end());
    const int picked = dist(rng);

    return scored[picked].second;
}
|
||||||
|
|
||||||
|
// Build a llama_batch around precomputed embeddings (no token ids).
// Every token belongs to sequence 0 and no position requests logits.
// When use_mrope is set, pos holds 4 * n_tokens entries: the first
// 3 * n_tokens are filled with npast + (j % n_tokens), the rest stay 0.
kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope)
{
    const int32_t sequence = 0;
    pos.assign(n_tokens * (use_mrope ? 4 : 1), 0);
    n_seq_id.resize(n_tokens);
    seq_ids.resize(n_tokens + 1);
    logits.resize(n_tokens);
    seq_id_0.assign(1, sequence);
    seq_ids[n_tokens] = nullptr; // null terminator expected by llama.cpp
    batch = {
        /*n_tokens =*/ n_tokens,
        /*tokens   =*/ nullptr,
        /*embd     =*/ embd,
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };

    // Common per-token fields: one sequence each, no logits requested.
    for (int i = 0; i < n_tokens; i++) {
        batch.n_seq_id[i] = 1;
        batch.seq_id [i] = seq_id_0.data();
        batch.logits [i] = false;
        if (!use_mrope) {
            batch.pos[i] = npast + i; // plain linear positions
        }
    }
    if (use_mrope) {
        // fill the first three position rows; the fourth row remains zero
        for (int j = 0; j < batch.n_tokens * 3; j++) {
            batch.pos[j] = npast + (j % batch.n_tokens);
        }
    }
}
|
||||||
|
|
||||||
|
// Build a llama_batch from a token vector. Every token belongs to sequence 0.
// Only the final token requests logits unless return_all_logits is set, in
// which case every position does. When use_mrope is set, pos holds
// 4 * n_tokens entries: the first 3 * n_tokens are filled with
// npast + (j % n_tokens), the remaining row stays 0.
kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
{
    int32_t seq_id = 0;
    int32_t n_tokens = tokens.size();
    pos.resize(n_tokens * (use_mrope?4:1));
    std::fill(pos.begin(), pos.end(), 0);
    n_seq_id.resize(n_tokens);
    seq_ids.resize(n_tokens + 1);
    logits.resize(n_tokens);
    seq_id_0.resize(1);
    seq_id_0[0] = seq_id;
    seq_ids[n_tokens] = nullptr; // null terminator expected by llama.cpp
    batch = {
        /*n_tokens =*/ n_tokens,
        /*tokens   =*/ tokens.data(),
        /*embd     =*/ nullptr,
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };

    if(!use_mrope)
    {
        for (int i = 0; i < n_tokens; i++) {
            batch.pos     [i] = npast + i; // plain linear positions
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data();
            batch.logits  [i] = (return_all_logits?true:false);
        }
    }
    else
    {
        for (int i = 0; i < n_tokens; i++) {
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data();
            batch.logits  [i] = (return_all_logits?true:false);
        }
        // fill the first three position rows; the fourth row remains zero
        for (int j = 0; j < batch.n_tokens * 3; j++) {
            batch.pos[j] = npast + (j % batch.n_tokens);
        }
    }
    // BUGFIX: guard the "always request logits for the last token" write —
    // with an empty token vector, logits[n_tokens - 1] was an out-of-bounds
    // write (logits has size 0).
    if (n_tokens > 0) {
        batch.logits[n_tokens - 1] = true;
    }
}
|
|
@ -8,6 +8,7 @@
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include "ggml_v3.h"
|
#include "ggml_v3.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
//
|
//
|
||||||
// CLI argument parsing
|
// CLI argument parsing
|
||||||
|
@ -52,10 +53,23 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
|
||||||
//
|
//
|
||||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
||||||
|
|
||||||
|
|
||||||
bool should_transpose_layer(std::string name);
|
bool should_transpose_layer(std::string name);
|
||||||
void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
|
void kcpp_graph_compute_helper(ggml_v3_cgraph * graph, int n_threads);
|
||||||
|
|
||||||
std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string);
|
std::vector<uint8_t> kcpp_base64_decode(const std::string & encoded_string);
|
||||||
|
std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length);
|
||||||
|
std::string kcpp_base64_encode(const std::string &data);
|
||||||
|
|
||||||
std::string get_timestamp_str();
|
std::string get_timestamp_str();
|
||||||
|
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
|
||||||
|
|
||||||
|
struct kcpp_embd_batch { //duplicated from llava_embd_batch
    // Backing storage for the pointer arrays handed to llama_batch below;
    // keeping them as members ties their lifetime to this object.
    std::vector<int32_t> pos;       // token positions (4x n_tokens when mrope is used)
    std::vector<int32_t> n_seq_id;  // per-token sequence count (the ctors set each to 1)
    std::vector<int32_t> seq_id_0;  // holds the single shared sequence id (0)
    std::vector<int32_t *> seq_ids; // per-token pointer into seq_id_0, nullptr-terminated
    std::vector<int8_t> logits;     // per-token "return logits" flags
    llama_batch batch;              // llama.cpp batch view over the vectors above
    // Build a batch from raw embeddings (batch.tokens stays null).
    kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope);
    // Build a batch from token ids (batch.embd stays null); return_all_logits
    // requests logits for every position instead of only the last one.
    kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits);
};
|
Loading…
Add table
Add a link
Reference in a new issue