From 13db5aee9e8e74d473cfc60e4a51bd40e7ca46ad Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 22 Feb 2026 23:15:08 +0800 Subject: [PATCH] stub files for loading ace step --- CMakeLists.txt | 11 +- Makefile | 21 +- expose.cpp | 9 + expose.h | 22 ++ koboldcpp.py | 131 +++++++- model_adapter.h | 7 +- otherarch/acestep/ace-qwen3.cpp | 453 +++++++++++++++++++--------- otherarch/acestep/music_adapter.cpp | 85 ++++++ 8 files changed, 571 insertions(+), 168 deletions(-) create mode 100644 otherarch/acestep/music_adapter.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index bff95369a..631b71271 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -481,6 +481,13 @@ target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) +add_library(music_adapter + otherarch/acestep/music_adapter.cpp) +target_include_directories(music_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/acestep ./tools ./common) +target_compile_features(music_adapter PUBLIC cxx_std_17) # don't bump +target_link_libraries(music_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) +set_target_properties(music_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) + add_library(tts_adapter otherarch/tts_adapter.cpp) target_include_directories(tts_adapter PUBLIC . 
./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./otherarch/qwen3tts ./tools ./common) @@ -510,7 +517,7 @@ if (LLAMA_CUBLAS) set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter embeddings_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter music_adapter tts_adapter embeddings_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) add_custom_command( @@ -530,7 +537,7 @@ if (LLAMA_HIPBLAS) set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter tts_adapter embeddings_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 ggml_v3 common2 gpttype_adapter whisper_adapter music_adapter tts_adapter embeddings_adapter sdtype_adapter ${LLAMA_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) add_custom_command( diff --git a/Makefile b/Makefile index a5640ac25..ffde3b095 100644 --- a/Makefile +++ b/Makefile @@ -698,6 +698,9 @@ tts_default.o: otherarch/tts_adapter.cpp otherarch/ttscpp/src/ttscpp.cpp otherar embeddings_default.o: otherarch/embeddings_adapter.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ +music_default.o: otherarch/acestep/music_adapter.cpp + $(CXX) $(CXXFLAGS) -c $< -o 
$@ + # idiotic "for easier compilation" GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampler.cpp src/llama-kv-cache.cpp src/llama-kv-cache-iswa.cpp src/llama-memory-hybrid.cpp src/llama-memory-hybrid-iswa.cpp src/llama-memory-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER) @@ -742,8 +745,6 @@ ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp other $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) qwen3tts: otherarch/qwen3tts/q3ttsmain.cpp otherarch/qwen3tts/qwen3_tts.cpp otherarch/qwen3tts/text_tokenizer.cpp otherarch/qwen3tts/gguf_loader.cpp otherarch/qwen3tts/tts_transformer.cpp otherarch/qwen3tts/audio_tokenizer_decoder.cpp otherarch/qwen3tts/audio_tokenizer_encoder.cpp otherarch/qwen3tts/coreml_code_predictor_stub.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -acestep-a: otherarch/acestep/ace-qwen3.cpp otherarch/acestep/request.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) - $(CXX) $(CXXFLAGS) 
$(filter-out %.h,$^) -o $@ $(LDFLAGS) acestep-b: otherarch/acestep/dit-vae.cpp otherarch/acestep/request.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -845,11 +846,11 @@ else endif #generated libraries -koboldcpp_default: ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o embeddings_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) +koboldcpp_default: ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o tts_default.o music_default.o embeddings_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(DEFAULT_BUILD) ifdef FAILSAFE_BUILD -koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml-ops-failsafe.o ggml-vec-failsafe.o ggml-binops.o ggml-unops.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o embeddings_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FAILSAFE) $(OBJS) +koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml-ops-failsafe.o ggml-vec-failsafe.o ggml-binops.o ggml-unops.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o music_default.o embeddings_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FAILSAFE) $(OBJS) $(FAILSAFE_BUILD) else 
koboldcpp_failsafe: @@ -857,7 +858,7 @@ koboldcpp_failsafe: endif ifdef NOAVX2_BUILD -koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml-ops-noavx2.o ggml-vec-noavx2.o ggml-binops.o ggml-unops.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o embeddings_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_SIMPLE) $(OBJS) +koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml-ops-noavx2.o ggml-vec-noavx2.o ggml-binops.o ggml-unops.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o tts_default.o music_default.o embeddings_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_SIMPLE) $(OBJS) $(NOAVX2_BUILD) else koboldcpp_noavx2: @@ -865,7 +866,7 @@ koboldcpp_noavx2: endif ifdef CUBLAS_BUILD -koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o ggml-repack.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o music_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o ggml-repack.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) $(CUBLAS_BUILD) else koboldcpp_cublas: @@ -873,7 +874,7 @@ koboldcpp_cublas: endif ifdef HIPBLAS_BUILD -koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o 
expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o ggml-repack.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o music_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o ggml-repack.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) $(HIPBLAS_BUILD) else koboldcpp_hipblas: @@ -881,12 +882,12 @@ koboldcpp_hipblas: endif ifdef VULKAN_BUILD -koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_FULL) $(OBJS) +koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(VULKAN_BUILD) ifdef NOAVX2_BUILD -koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml-ops-noavx2.o ggml-vec-noavx2.o ggml-binops.o ggml-unops.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLE) $(OBJS) 
+koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml-ops-noavx2.o ggml-vec-noavx2.o ggml-binops.o ggml-unops.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLE) $(OBJS) $(VULKAN_BUILD) -koboldcpp_vulkan_failsafe: ggml_v4_vulkan_failsafe.o ggml-cpu_v4_failsafe.o ggml-ops-failsafe.o ggml-vec-failsafe.o ggml-binops.o ggml-unops.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLER) $(OBJS) +koboldcpp_vulkan_failsafe: ggml_v4_vulkan_failsafe.o ggml-cpu_v4_failsafe.o ggml-ops-failsafe.o ggml-vec-failsafe.o ggml-binops.o ggml-unops.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLER) $(OBJS) $(VULKAN_BUILD) else koboldcpp_vulkan_noavx2: diff --git a/expose.cpp b/expose.cpp index eaee1407e..7af56114c 100644 --- a/expose.cpp +++ b/expose.cpp @@ -242,6 +242,15 @@ extern "C" return embeddingstype_generate(inputs); } + bool music_load_model(const music_load_model_inputs inputs) + { + return musictype_load_model(inputs); + } + music_generation_outputs music_generate(const music_generation_inputs inputs) + { + return musictype_generate(inputs); + } + const char * new_token(int idx) { if (generated_tokens.size() <= idx || idx < 0) 
return nullptr; diff --git a/expose.h b/expose.h index 923e23c54..2785a5758 100644 --- a/expose.h +++ b/expose.h @@ -326,6 +326,28 @@ struct embeddings_generation_outputs const char * data = ""; }; +struct music_load_model_inputs +{ + const char * musicllm_filename = nullptr; + const char * musicembedding_filename = nullptr; + const char * musicdiffusion_filename = nullptr; + const char * musicvae_filename = nullptr; + const char * executable_path = nullptr; + const int kcpp_main_gpu = 0; + const char * vulkan_info = nullptr; + const char * devices_override = nullptr; + const bool quiet = false; + const int debugmode = 0; +}; +struct music_generation_inputs +{ + const char * prompt = nullptr; +}; +struct music_generation_outputs +{ + int status = -1; +}; + extern std::string executable_path; extern std::string lora_filename; extern std::string mmproj_filename; diff --git a/koboldcpp.py b/koboldcpp.py index 281c17c57..4b096cd0e 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -87,6 +87,7 @@ password = "" #if empty, no auth key required fullwhispermodelpath = "" #if empty, it's not initialized ttsmodelpath = "" #if empty, not initialized embeddingsmodelpath = "" #if empty, not initialized +musicdiffusionmodelpath = "" #if empty, not initialized maxctx = 8192 maxhordectx = 0 #set to whatever maxctx is if 0 maxhordelen = 1024 @@ -436,6 +437,24 @@ class embeddings_generation_outputs(ctypes.Structure): ("count", ctypes.c_int), ("data", ctypes.c_char_p)] +class music_load_model_inputs(ctypes.Structure): + _fields_ = [("musicllm_filename", ctypes.c_char_p), + ("musicembedding_filename", ctypes.c_char_p), + ("musicdiffusion_filename", ctypes.c_char_p), + ("musicvae_filename", ctypes.c_char_p), + ("executable_path", ctypes.c_char_p), + ("kcpp_main_gpu", ctypes.c_int), + ("vulkan_info", ctypes.c_char_p), + ("devices_override", ctypes.c_char_p), + ("quiet", ctypes.c_bool), + ("debugmode", ctypes.c_int)] + +class music_generation_inputs(ctypes.Structure): + _fields_ = 
[("prompt", ctypes.c_char_p)] + +class music_generation_outputs(ctypes.Structure): + _fields_ = [("status", ctypes.c_int)] + class StdoutRedirector: def __init__(self, writer): self.writer = writer @@ -798,6 +817,10 @@ def init_library(): handle.embeddings_load_model.restype = ctypes.c_bool handle.embeddings_generate.argtypes = [embeddings_generation_inputs] handle.embeddings_generate.restype = embeddings_generation_outputs + handle.music_load_model.argtypes = [music_load_model_inputs] + handle.music_load_model.restype = ctypes.c_bool + handle.music_generate.argtypes = [music_generation_inputs] + handle.music_generate.restype = music_generation_outputs handle.last_logprobs.restype = last_logprobs_outputs handle.detokenize.argtypes = [token_count_outputs] handle.detokenize.restype = ctypes.c_char_p @@ -1118,7 +1141,7 @@ def convert_json_to_gbnf(json_obj): return "" def get_capabilities(): - global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support, has_vision_support, mcp_connections + global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, musicdiffusionmodelpath, has_audio_support, has_vision_support, mcp_connections has_llm = not (friendlymodelname=="inactive") has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="") has_password = (password!="") @@ -1126,11 +1149,12 @@ def get_capabilities(): has_search = True if args.websearch else False has_tts = (ttsmodelpath!="") has_embeddings = (embeddingsmodelpath!="") + has_music = (musicdiffusionmodelpath!="") has_guidance = True if args.enableguidance else False has_jinja = True if args.jinja else False has_mcp = True if (args.mcpfile and mcp_connections and len(mcp_connections) > 0) else False admin_type = (2 if args.admin and args.admindir and 
args.adminpassword else (1 if args.admin and args.admindir else 0)) - return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision_support,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance, "jinja": has_jinja, "mcp":has_mcp} + return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision_support,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "music":has_music, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance, "jinja": has_jinja, "mcp":has_mcp} def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo chunk_size = 1024*1024*12 # read first 12mb of file @@ -2337,6 +2361,28 @@ def embeddings_generate(genparams): tokcnt += tmpcnt return {"count":tokcnt, "data":tokarrs} +def music_load_model(musicllm,musicembedding,musicdiffusion,musicvae): + global args + inputs = music_load_model_inputs() + inputs.musicllm_filename = musicllm.encode("UTF-8") + inputs.musicembedding_filename = musicembedding.encode("UTF-8") + inputs.musicdiffusion_filename = musicdiffusion.encode("UTF-8") + inputs.musicvae_filename = musicvae.encode("UTF-8") + inputs = set_backend_props(inputs) + ret = handle.music_load_model(inputs) + return ret + +def music_generate(genparams): + global args + prompt = genparams.get("prompt", "") + inputs = music_generation_inputs() + inputs.prompt = prompt.encode("UTF-8") + ret = handle.music_generate(inputs) + outstr = "" + if ret.status==1: + outstr = "OK" # TODO: music_generation_outputs only exposes 'status' so far; add a 'data' field (C struct + ctypes _fields_) before decoding generated audio here + return outstr + def tokenize_ids(countprompt,tcaddspecial):
rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial) countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0 @@ -5403,7 +5449,7 @@ def show_gui(): if dlfile: args.model_param = dlfile load_config_cli(args.model_param) - if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.embeddingsmodel and not args.mcpfile and not args.nomodel: + if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.embeddingsmodel and not args.musicdiffusion and not args.mcpfile and not args.nomodel: global exitcounter exitcounter = 999 exit_with_error(2,"No gguf model or kcpps file was selected. Exiting.") @@ -5674,6 +5720,11 @@ def show_gui(): ttsmaxlen_var = ctk.StringVar(value=str(default_ttsmaxlen)) tts_dir_var = ctk.StringVar() + musicllm_var = ctk.StringVar() + musicembeddings_var = ctk.StringVar() + musicdiffusion_var = ctk.StringVar() + musicvae_var = ctk.StringVar() + embeddings_model_var = ctk.StringVar() embeddings_ctx_var = ctk.StringVar(value=str("")) embeddings_gpu_var = ctk.IntVar(value=0) @@ -6464,14 +6515,19 @@ def show_gui(): whisper_model_var.trace_add("write", gui_changed_modelfile) makefileentry(audio_tab, "TTS Model (Text-To-Speech):", "Select TTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a TTS GGUF model file on disk to be loaded for Narration.") tts_model_var.trace_add("write", gui_changed_modelfile) - makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.") - makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.") - makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.") + 
makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=100,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.") + makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 5, 50,padx=300,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.", labelpadx=190) + makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS. Currently only works on OuteTTS.") ttsgpu_var.trace_add("write", gui_changed_modelfile) makefileentry(audio_tab, "WavTokenizer Model (Required for some models):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.") wavtokenizer_var.trace_add("write", gui_changed_modelfile) makefileentry(audio_tab, "TTS Voices Dir:", "Select directory containing voices for voice cloning", tts_dir_var, 20, width=280, singlerow=True, dialog_type=2, tooltiptxt="Select directory containing voices for voice cloning") + makefileentry(audio_tab, "MusicLLM:", "Select music LLM model (e.g acestep-5Hz-lm-0.6B)", musicllm_var, 30, width=280, singlerow=True, dialog_type=0, tooltiptxt="Select music LLM model (e.g acestep-5Hz-lm)") + makefileentry(audio_tab, "MusicEmbeds:", "Select music embedding model (e.g Qwen3-Embedding-0.6B)", musicembeddings_var, 32, width=280, singlerow=True, dialog_type=0, tooltiptxt="Select music embedding model (e.g Qwen3-Embedding-0.6B)") + makefileentry(audio_tab, "MusicDiffuser:", "Select music diffusion (DiT) model (e.g acestep-v15-turbo)", musicdiffusion_var, 34, width=280, singlerow=True, dialog_type=0, tooltiptxt="Select music diffusion (DiT) model (e.g acestep-v15-turbo)") + makefileentry(audio_tab, "MusicVAE:", "Select music VAE model", musicvae_var, 36, width=280, singlerow=True, dialog_type=0, tooltiptxt="Select music VAE model") + admin_tab = tabcontent["Admin"] def 
toggleadmin(a,b,c): @@ -6561,7 +6617,7 @@ def show_gui(): # launch def guilaunch(): - if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and embeddings_model_var.get() == "" and nomodel.get()!=1: + if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and embeddings_model_var.get() == "" and musicdiffusion_var.get() == "" and nomodel.get()!=1: tmp = zentk_askopenfilename(title="Select ggml model .bin or .gguf file") model_var.set(tmp) nonlocal nextstate @@ -6786,6 +6842,11 @@ def show_gui(): args.ttsmaxlen = (default_ttsmaxlen if ttsmaxlen_var.get()=="" else int(ttsmaxlen_var.get())) args.ttsdir = tts_dir_var.get() + args.musicllm = musicllm_var.get() + args.musicembeddings = musicembeddings_var.get() + args.musicdiffusion = musicdiffusion_var.get() + args.musicvae = musicvae_var.get() + args.admin = (admin_var.get()==1 and not args.cli) args.admindir = admin_dir_var.get() args.adminpassword = admin_password_var.get() @@ -7028,6 +7089,11 @@ def show_gui(): ttsmaxlen_var.set(str(dict["ttsmaxlen"]) if ("ttsmaxlen" in dict and dict["ttsmaxlen"]) else str(default_ttsmaxlen)) tts_dir_var.set(dict["ttsdir"] if ("ttsdir" in dict and dict["ttsdir"]) else "") + musicllm_var.set(dict["musicllm"] if ("musicllm" in dict and dict["musicllm"]) else "") + musicembeddings_var.set(dict["musicembeddings"] if ("musicembeddings" in dict and dict["musicembeddings"]) else "") + musicdiffusion_var.set(dict["musicdiffusion"] if ("musicdiffusion" in dict and dict["musicdiffusion"]) else "") + musicvae_var.set(dict["musicvae"] if ("musicvae" in dict and dict["musicvae"]) else "") + embeddings_model_var.set(dict["embeddingsmodel"] if ("embeddingsmodel" in dict and dict["embeddingsmodel"]) else "") embeddings_ctx_var.set(str(dict["embeddingsmaxctx"]) if ("embeddingsmaxctx" in dict and dict["embeddingsmaxctx"]) else "") embeddings_gpu_var.set(dict["embeddingsgpu"] 
if ("embeddingsgpu" in dict) else 0) @@ -7166,7 +7232,7 @@ def show_gui(): kcpp_exporting_template = False export_vars() - if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.embeddingsmodel and not args.mcpfile and not args.nomodel: + if not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.embeddingsmodel and not args.musicdiffusion and not args.mcpfile and not args.nomodel: exitcounter = 999 print("") time.sleep(0.5) @@ -8023,7 +8089,7 @@ def main(launch_args, default_args): load_config_cli(args.model_param) # show the GUI launcher if a model was not provided - if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.embeddingsmodel and not args.mcpfile and not args.nomodel): + if args.showgui or (not args.model_param and not args.sdmodel and not args.whispermodel and not args.ttsmodel and not args.embeddingsmodel and not args.musicdiffusion and not args.mcpfile and not args.nomodel): #give them a chance to pick a file print("For command line arguments, please refer to --help") print("***") @@ -8145,7 +8211,7 @@ def main(launch_args, default_args): def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, embedded_kailite_gz, embedded_kcpp_docs_gz, embedded_kcpp_sdui_gz, embedded_lcpp_ui_gz, start_time, exitcounter, global_memory, using_gui_launcher - global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support, cached_chat_template + global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, musicdiffusionmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support, 
cached_chat_template start_server = True @@ -8307,6 +8373,23 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): if dlfile: args.embeddingsmodel = dlfile + if args.musicllm and args.musicllm!="": + dlfile = download_model_from_url(args.musicllm,[".gguf"],min_file_size=500000) + if dlfile: + args.musicllm = dlfile + if args.musicembeddings and args.musicembeddings!="": + dlfile = download_model_from_url(args.musicembeddings,[".gguf"],min_file_size=500000) + if dlfile: + args.musicembeddings = dlfile + if args.musicdiffusion and args.musicdiffusion!="": + dlfile = download_model_from_url(args.musicdiffusion,[".gguf"],min_file_size=500000) + if dlfile: + args.musicdiffusion = dlfile + if args.musicvae and args.musicvae!="": + dlfile = download_model_from_url(args.musicvae,[".gguf"],min_file_size=500000) + if dlfile: + args.musicvae = dlfile + # sanitize and replace the default vanity name. remember me.... if args.model_param and args.model_param!="": newmdldisplayname = os.path.basename(args.model_param) @@ -8675,6 +8758,28 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): exitcounter = 999 exit_with_error(3,"Could not load Embeddings model!") + #handle music model + if (args.musicdiffusion and args.musicdiffusion!="") or (args.musicllm and args.musicllm!="") or (args.musicembeddings and args.musicembeddings!="") or (args.musicvae and args.musicvae!=""): + if not os.path.exists(args.musicllm) or not os.path.exists(args.musicembeddings) or not os.path.exists(args.musicdiffusion) or not os.path.exists(args.musicvae): + if args.ignoremissing: + print("Ignoring missing Music model files!") + args.musicllm = None + args.musicembeddings = None + args.musicdiffusion = None + args.musicvae = None + else: + exitcounter = 999 + exit_with_error(2,"Cannot find music model files or missing a music model. 
Make sure ALL 4 music models (llm,embed,diffusion and vae) are loaded!") + else: + musicdiffusionmodelpath = os.path.abspath(args.musicdiffusion) + musicembedpath = os.path.abspath(args.musicembeddings) + musicllmpath = os.path.abspath(args.musicllm) + musicvaepath = os.path.abspath(args.musicvae) + loadok = music_load_model(musicllmpath,musicembedpath,musicdiffusionmodelpath,musicvaepath) + print("Load Music Model OK: " + str(loadok)) + if not loadok: + exitcounter = 999 + exit_with_error(3,"Could not load Music model!") #load embedded lite embddir = os.path.join(os.path.abspath(os.path.dirname(os.path.realpath(__file__))),"embd_res") @@ -9114,6 +9219,12 @@ if __name__ == '__main__': ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0) ttsparsergroup.add_argument("--ttsdir", metavar=('[directory]'), help="Select directory containing voices for voice cloning.", default="") + musicparsergroup = parser.add_argument_group('Music Gen Commands') + musicparsergroup.add_argument("--musicllm", metavar=('[filename]'), help="Select music LLM model (e.g acestep-5Hz-lm-0.6B)", default="") + musicparsergroup.add_argument("--musicembeddings", metavar=('[filename]'), help="Select music embedding model (e.g Qwen3-Embedding-0.6B)", default="") + musicparsergroup.add_argument("--musicdiffusion", metavar=('[filename]'), help="Select music diffusion (DiT) model (e.g acestep-v15-turbo)", default="") + musicparsergroup.add_argument("--musicvae", metavar=('[filename]'), help="Select music VAE model", default="") + embeddingsparsergroup = parser.add_argument_group('Embeddings Model Commands') embeddingsparsergroup.add_argument("--embeddingsmodel", metavar=('[filename]'), help="Specify an embeddings model to be loaded for generating embedding vectors.", default="") embeddingsparsergroup.add_argument("--embeddingsmaxctx", metavar=('[amount]'), 
help="Overrides the default maximum supported context of an embeddings model (defaults to trained context).", type=int, default=0) diff --git a/model_adapter.h b/model_adapter.h index 98ee2f551..a1f860c49 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -120,6 +120,9 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs); bool embeddingstype_load_model(const embeddings_load_model_inputs inputs); embeddings_generation_outputs embeddingstype_generate(const embeddings_generation_inputs inputs); +bool musictype_load_model(const music_load_model_inputs inputs); +music_generation_outputs musictype_generate(const music_generation_inputs inputs); + void timer_start(); double timer_check(); void print_tok_vec(std::vector &embd); @@ -130,9 +133,7 @@ bool ArrStartWith(const std::vector targetArray, const std::vector sea int ArrFindIndexOf(const std::vector targetArray, const std::vector searchSeq); FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta); -void ContextFastForward(std::vector ¤t_context_tokens, std::vector &embd_inp, - int &n_past, std::vector &last_n_tokens, const int nctx, std::vector &smartcontext, - const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed); +void ContextFastForward(std::vector ¤t_context_tokens, std::vector &embd_inp, int &n_past, std::vector &last_n_tokens, const int nctx, std::vector &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed); bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch); std::string gguf_get_model_arch(const std::string & filename); diff --git a/otherarch/acestep/ace-qwen3.cpp b/otherarch/acestep/ace-qwen3.cpp index 20880b00f..464d7b893 100644 --- a/otherarch/acestep/ace-qwen3.cpp +++ b/otherarch/acestep/ace-qwen3.cpp @@ -1096,70 +1096,51 @@ static void usage(const char * prog) { "\n", prog); } -int main(int argc, char ** argv) { - 
const char * model_path = nullptr; - const char * request_path = nullptr; + +//kcpp stuff + +static Qwen3LM acestep_llm; +static BPETokenizer acestep_bpe; +static bool acestep_loaded = false; + +bool load_acestep(std::string model_path) +{ + acestep_loaded = false; int max_seq = 8192; - int batch_size = 1; - bool use_fsm = true; - const char * dump_logits = nullptr; - const char * dump_tokens = nullptr; + const int batch_size = 1; //only bs 1 is allowed + if (!load_bpe_from_gguf(&acestep_bpe, model_path.c_str())) { + return false; + } + // Load model + int n_kv_sets = 2 * batch_size; + if (!qw3lm_load(&acestep_llm, model_path.c_str(), max_seq, n_kv_sets)) { + return false; + } + acestep_loaded = true; + return true; +} - if (argc < 2) { - usage(argv[0]); - return 1; +AceRequest acestep_prepare_request() +{ + const int batch_size = 1; + bool use_fsm = true; + MetadataFSM fsm; + if (use_fsm) { + fsm.init(acestep_bpe, acestep_llm.cfg.vocab_size); } - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "--model") && i + 1 < argc) - model_path = argv[++i]; - else if (!strcmp(argv[i], "--request") && i + 1 < argc) - request_path = argv[++i]; - else if (!strcmp(argv[i], "--max-seq") && i + 1 < argc) - max_seq = atoi(argv[++i]); - else if (!strcmp(argv[i], "--batch") && i + 1 < argc) - batch_size = atoi(argv[++i]); - else if (!strcmp(argv[i], "--no-fsm")) - use_fsm = false; - else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc) - dump_logits = argv[++i]; - else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc) - dump_tokens = argv[++i]; - else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) { - usage(argv[0]); - return 0; - } - else { - fprintf(stderr, "Unknown option: %s\n", argv[i]); - usage(argv[0]); - return 1; - } - } - - if (!model_path) { - fprintf(stderr, "ERROR: --model required\n"); - usage(argv[0]); return 1; - } - if (!request_path) { - fprintf(stderr, "ERROR: --request required\n"); - usage(argv[0]); return 1; - } - - // Read request 
JSON + // Read request and set essentials AceRequest req; - if (!request_parse(&req, request_path)) return 1; - request_dump(&req, stderr); + request_init(&req); + req.caption = "doom"; + req.lyrics = ""; //can be overridden or left auto + req.inference_steps = 8; + req.vocal_language = "en"; - if (req.caption.empty()) { - fprintf(stderr, "ERROR: caption is empty in %s\n", request_path); - return 1; - } - - // Resolve seed int seed = req.seed; - if (seed < 0) { - std::random_device rd; - seed = (int)(rd() & 0x7FFFFFFF); + if (seed <= 0 || seed==0xFFFFFFFF) + { + seed = (((uint32_t)time(NULL)) % 1000000u); } req.seed = seed; @@ -1169,23 +1150,6 @@ int main(int argc, char ** argv) { float cfg_scale = req.lm_cfg_scale; const char * neg_prompt = req.lm_negative_prompt.c_str(); - Timer t_total; - - // Load BPE tokenizer from model GGUF - BPETokenizer bpe; - if (!load_bpe_from_gguf(&bpe, model_path)) return 1; - - // Load model - int n_kv_sets = (cfg_scale > 1.0f) ? 2 * batch_size : batch_size; - Timer t_load; - Qwen3LM model; - if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1; - double load_ms = t_load.ms(); - - // FSM - MetadataFSM fsm; - if (use_fsm) fsm.init(bpe, model.cfg.vocab_size); - // Copy request -> AcePrompt (internal LLM struct) AcePrompt ace = {}; ace.caption = req.caption; @@ -1216,24 +1180,24 @@ int main(int argc, char ** argv) { " and specific musical description:\n"; std::string user_msg = ace.caption + "\n\ninstrumental: " + std::string(req.instrumental ? 
"true" : "false"); - prompt = build_custom_prompt(bpe, sys, user_msg.c_str()); + prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str()); // FSM: reset then optionally force language (shared for both paths) fsm.reset(); if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty()) - fsm.force_language(bpe, ace.vocal_language); + fsm.force_language(acestep_bpe, ace.vocal_language); // Phase 1: N lyrics + metadata generations (always batched, N=batch_size) fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %d..%d\n", prompt.size(), batch_size, seed, seed + batch_size - 1); auto phase1_texts = generate_phase1_batch( - &model, &bpe, prompt, 2048, temperature, 1.0f, + &acestep_llm, &acestep_bpe, prompt, 2048, temperature, 1.0f, seed, batch_size, use_fsm ? &fsm : nullptr, true); parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true); - for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&model, i); + for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i); } // Re-evaluate after possible simple enrichment @@ -1243,97 +1207,300 @@ int main(int argc, char ** argv) { if (!has_all_metas) { // Partial-metas: Phase 1 with CFG to fill missing fields - prompt = build_lm_prompt(bpe, ace); + prompt = build_lm_prompt(acestep_bpe, ace); std::vector uncond; if (cfg_scale > 1.0f) - uncond = build_lm_prompt_uncond(bpe, ace, neg_prompt); + uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt); fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %d..%d\n", prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1); fsm.reset(); auto phase1_texts = generate_phase1_batch( - &model, &bpe, prompt, 2048, temperature, top_p, + &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, seed, batch_size, use_fsm ? &fsm : nullptr, false, cfg_scale, uncond.empty() ? 
nullptr : &uncond, true); parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false); - for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&model, i); + for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i); } // Guarantee aces is populated (all-metas: single shared ace for prefill optimization) - if (aces.empty()) aces = {ace}; - - // Debug: dump tokens/logits - if (need_lm_codes && (dump_logits || dump_tokens)) { - std::string cot = build_cot_yaml(aces[0]); - auto dbg_prompt = build_lm_prompt_with_cot(bpe, aces[0], cot); - - if (dump_tokens) { - FILE * f = fopen(dump_tokens, "w"); - if (f) { - for (size_t j = 0; j < dbg_prompt.size(); j++) - fprintf(f, "%s%d", j ? "," : "", dbg_prompt[j]); - fprintf(f, "\n"); - fclose(f); - fprintf(stderr, "[Debug] Tokens -> %s (%zu)\n", - dump_tokens, dbg_prompt.size()); - } - } - if (dump_logits) { - std::vector dbg_logits(model.cfg.vocab_size); - qw3lm_forward(&model, dbg_prompt.data(), (int)dbg_prompt.size(), 0, dbg_logits.data()); - FILE * f = fopen(dump_logits, "wb"); - if (f) { - fwrite(dbg_logits.data(), sizeof(float), model.cfg.vocab_size, f); - fclose(f); - fprintf(stderr, "[Debug] Logits -> %s (%d floats, argmax=%d)\n", - dump_logits, model.cfg.vocab_size, - (int)(std::max_element(dbg_logits.begin(), dbg_logits.end()) - dbg_logits.begin())); - } - qw3lm_reset_kv(&model, 0); - } + if (aces.empty()) { + aces = { ace }; } // Phase 2: generate audio codes (always batched, N=batch_size) std::vector batch_codes(batch_size); if (need_lm_codes) { - batch_codes = run_phase2_batch(&model, bpe, aces, + batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces, temperature, top_p, seed, batch_size, cfg_scale, neg_prompt); } else { fprintf(stderr, "[Skip] %s, no code generation\n", user_has_codes ? "user codes present" : "thinking=false"); } - // Write N output files: request0.json, request1.json, ... 
- { - std::string base(request_path); - std::string ext = ".json"; - size_t dot = base.rfind('.'); - if (dot != std::string::npos) { ext = base.substr(dot); base = base.substr(0, dot); } - for (int b = 0; b < batch_size; b++) { - AceRequest rr = req; - const AcePrompt & a = aces[b < (int)aces.size() ? b : 0]; - rr.caption = a.caption; - rr.lyrics = a.lyrics; - rr.bpm = a.bpm; - rr.duration = a.duration; - rr.keyscale = a.keyscale; - rr.timesignature = a.timesignature; - rr.vocal_language = a.vocal_language; - if (!batch_codes[b].empty()) rr.audio_codes = batch_codes[b]; - rr.seed = seed + b; - char path[512]; - snprintf(path, sizeof(path), "%s%d%s", base.c_str(), b, ext.c_str()); - request_write(&rr, path); - fprintf(stderr, "[Output] Wrote %s\n", path); - } - } + // only batch size 1 is allowed + AceRequest rr = req; + const AcePrompt & a = aces[0]; + rr.caption = a.caption; + rr.lyrics = a.lyrics; + rr.bpm = a.bpm; + rr.duration = a.duration; + rr.keyscale = a.keyscale; + rr.timesignature = a.timesignature; + rr.vocal_language = a.vocal_language; + if (!batch_codes[0].empty()) rr.audio_codes = batch_codes[0]; + rr.seed = seed; - fprintf(stderr, "[Ace-Qwen3] Load %.0f | Total %.0fms | seed=%d\n", - load_ms, t_total.ms(), seed); - - qw3lm_free(&model); - return 0; + return rr; } + +void unload_acestep() +{ + qw3lm_free(&acestep_llm); +} + + +// int main(int argc, char ** argv) { +// const char * model_path = nullptr; +// const char * request_path = nullptr; +// int max_seq = 8192; +// int batch_size = 1; +// bool use_fsm = true; +// const char * dump_logits = nullptr; +// const char * dump_tokens = nullptr; + +// if (argc < 2) { +// usage(argv[0]); +// return 1; +// } + +// for (int i = 1; i < argc; i++) { +// if (!strcmp(argv[i], "--model") && i + 1 < argc) +// model_path = argv[++i]; +// else if (!strcmp(argv[i], "--request") && i + 1 < argc) +// request_path = argv[++i]; +// else if (!strcmp(argv[i], "--max-seq") && i + 1 < argc) +// max_seq = atoi(argv[++i]); 
+// else if (!strcmp(argv[i], "--batch") && i + 1 < argc) +// batch_size = atoi(argv[++i]); +// else if (!strcmp(argv[i], "--no-fsm")) +// use_fsm = false; +// else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc) +// dump_logits = argv[++i]; +// else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc) +// dump_tokens = argv[++i]; +// else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) { +// usage(argv[0]); +// return 0; +// } +// else { +// fprintf(stderr, "Unknown option: %s\n", argv[i]); +// usage(argv[0]); +// return 1; +// } +// } + +// if (!model_path) { +// fprintf(stderr, "ERROR: --model required\n"); +// usage(argv[0]); return 1; +// } +// if (!request_path) { +// fprintf(stderr, "ERROR: --request required\n"); +// usage(argv[0]); return 1; +// } + +// // Read request JSON +// AceRequest req; +// if (!request_parse(&req, request_path)) return 1; +// request_dump(&req, stderr); + +// if (req.caption.empty()) { +// fprintf(stderr, "ERROR: caption is empty in %s\n", request_path); +// return 1; +// } + +// // Resolve seed +// int seed = req.seed; +// if (seed < 0) { +// std::random_device rd; +// seed = (int)(rd() & 0x7FFFFFFF); +// } +// req.seed = seed; + +// // Generation params from request +// float temperature = req.lm_temperature; +// float top_p = req.lm_top_p; +// float cfg_scale = req.lm_cfg_scale; +// const char * neg_prompt = req.lm_negative_prompt.c_str(); + +// Timer t_total; + +// // Load BPE tokenizer from model GGUF +// BPETokenizer bpe; +// if (!load_bpe_from_gguf(&bpe, model_path)) return 1; + +// // Load model +// int n_kv_sets = (cfg_scale > 1.0f) ? 
2 * batch_size : batch_size; +// Timer t_load; +// Qwen3LM model; +// if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1; +// double load_ms = t_load.ms(); + +// // FSM +// MetadataFSM fsm; +// if (use_fsm) fsm.init(bpe, model.cfg.vocab_size); + +// // Copy request -> AcePrompt (internal LLM struct) +// AcePrompt ace = {}; +// ace.caption = req.caption; +// ace.lyrics = req.lyrics; +// ace.duration = req.duration; +// ace.bpm = req.bpm; +// ace.keyscale = req.keyscale; +// ace.timesignature = req.timesignature; +// ace.vocal_language = req.vocal_language; + +// bool user_has_codes = !req.audio_codes.empty(); +// bool need_lm_codes = req.thinking && !user_has_codes; + +// bool is_simple = ace.lyrics.empty() && +// ace.bpm <= 0 && ace.duration <= 0 && +// ace.keyscale.empty() && ace.timesignature.empty(); + +// std::vector prompt; +// std::vector aces; // populated by Phase 1 (simple or partial) + +// // Preprocessor: simple mode generates lyrics + metas from caption +// if (is_simple) { +// fprintf(stderr, "[Simple] Inspiration\n"); + +// const char * sys = +// "# Instruction\n" +// "Expand the user's input into a more detailed" +// " and specific musical description:\n"; +// std::string user_msg = ace.caption + "\n\ninstrumental: " +// + std::string(req.instrumental ? "true" : "false"); +// prompt = build_custom_prompt(bpe, sys, user_msg.c_str()); + +// // FSM: reset then optionally force language (shared for both paths) +// fsm.reset(); +// if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty()) +// fsm.force_language(bpe, ace.vocal_language); + +// // Phase 1: N lyrics + metadata generations (always batched, N=batch_size) +// fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %d..%d\n", +// prompt.size(), batch_size, seed, seed + batch_size - 1); + +// auto phase1_texts = generate_phase1_batch( +// &model, &bpe, prompt, 2048, temperature, 1.0f, +// seed, batch_size, use_fsm ? 
&fsm : nullptr, true); + +// parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true); + +// for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&model, i); +// } + +// // Re-evaluate after possible simple enrichment +// const AcePrompt & ace_ref = aces.empty() ? ace : aces[0]; +// bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 && +// !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty()); + +// if (!has_all_metas) { +// // Partial-metas: Phase 1 with CFG to fill missing fields +// prompt = build_lm_prompt(bpe, ace); +// std::vector uncond; +// if (cfg_scale > 1.0f) +// uncond = build_lm_prompt_uncond(bpe, ace, neg_prompt); + +// fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %d..%d\n", +// prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1); + +// fsm.reset(); +// auto phase1_texts = generate_phase1_batch( +// &model, &bpe, prompt, 2048, temperature, top_p, +// seed, batch_size, use_fsm ? &fsm : nullptr, false, +// cfg_scale, uncond.empty() ? nullptr : &uncond, true); + +// parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false); + +// for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&model, i); +// } + +// // Guarantee aces is populated (all-metas: single shared ace for prefill optimization) +// if (aces.empty()) aces = {ace}; + +// // Debug: dump tokens/logits +// if (need_lm_codes && (dump_logits || dump_tokens)) { +// std::string cot = build_cot_yaml(aces[0]); +// auto dbg_prompt = build_lm_prompt_with_cot(bpe, aces[0], cot); + +// if (dump_tokens) { +// FILE * f = fopen(dump_tokens, "w"); +// if (f) { +// for (size_t j = 0; j < dbg_prompt.size(); j++) +// fprintf(f, "%s%d", j ? 
"," : "", dbg_prompt[j]); +// fprintf(f, "\n"); +// fclose(f); +// fprintf(stderr, "[Debug] Tokens -> %s (%zu)\n", +// dump_tokens, dbg_prompt.size()); +// } +// } +// if (dump_logits) { +// std::vector dbg_logits(model.cfg.vocab_size); +// qw3lm_forward(&model, dbg_prompt.data(), (int)dbg_prompt.size(), 0, dbg_logits.data()); +// FILE * f = fopen(dump_logits, "wb"); +// if (f) { +// fwrite(dbg_logits.data(), sizeof(float), model.cfg.vocab_size, f); +// fclose(f); +// fprintf(stderr, "[Debug] Logits -> %s (%d floats, argmax=%d)\n", +// dump_logits, model.cfg.vocab_size, +// (int)(std::max_element(dbg_logits.begin(), dbg_logits.end()) - dbg_logits.begin())); +// } +// qw3lm_reset_kv(&model, 0); +// } +// } + +// // Phase 2: generate audio codes (always batched, N=batch_size) +// std::vector batch_codes(batch_size); +// if (need_lm_codes) { +// batch_codes = run_phase2_batch(&model, bpe, aces, +// temperature, top_p, seed, batch_size, cfg_scale, neg_prompt); +// } else { +// fprintf(stderr, "[Skip] %s, no code generation\n", +// user_has_codes ? "user codes present" : "thinking=false"); +// } + +// // Write N output files: request0.json, request1.json, ... +// { +// std::string base(request_path); +// std::string ext = ".json"; +// size_t dot = base.rfind('.'); +// if (dot != std::string::npos) { ext = base.substr(dot); base = base.substr(0, dot); } +// for (int b = 0; b < batch_size; b++) { +// AceRequest rr = req; +// const AcePrompt & a = aces[b < (int)aces.size() ? 
b : 0]; +// rr.caption = a.caption; +// rr.lyrics = a.lyrics; +// rr.bpm = a.bpm; +// rr.duration = a.duration; +// rr.keyscale = a.keyscale; +// rr.timesignature = a.timesignature; +// rr.vocal_language = a.vocal_language; +// if (!batch_codes[b].empty()) rr.audio_codes = batch_codes[b]; +// rr.seed = seed + b; +// char path[512]; +// snprintf(path, sizeof(path), "%s%d%s", base.c_str(), b, ext.c_str()); +// request_write(&rr, path); +// fprintf(stderr, "[Output] Wrote %s\n", path); +// } +// } + +// fprintf(stderr, "[Ace-Qwen3] Load %.0f | Total %.0fms | seed=%d\n", +// load_ms, t_total.ms(), seed); + +// qw3lm_free(&model); +// return 0; +// } diff --git a/otherarch/acestep/music_adapter.cpp b/otherarch/acestep/music_adapter.cpp new file mode 100644 index 000000000..1b32d5762 --- /dev/null +++ b/otherarch/acestep/music_adapter.cpp @@ -0,0 +1,85 @@ +#include "model_adapter.h" +#include "otherarch/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "./request.cpp" +#include "./ace-qwen3.cpp" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static int musicdebugmode = 0; +static bool music_is_quiet = false; +static bool musicgen_loaded = false; +static std::string musicvulkandeviceenv; + +bool musictype_load_model(const music_load_model_inputs inputs) +{ + music_is_quiet = inputs.quiet; + + //duplicated from expose.cpp + std::string vulkan_info_raw = inputs.vulkan_info; + std::string vulkan_info_str = ""; + for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { + vulkan_info_str += vulkan_info_raw[i]; + if (i < vulkan_info_raw.length() - 1) { + vulkan_info_str += ","; + } + } + const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); + if(!existingenv && vulkan_info_str!="") + { + musicvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; + putenv((char*)musicvulkandeviceenv.c_str()); + } + + std::string musicllm_filename = 
inputs.musicllm_filename; + std::string musicembedding_filename = inputs.musicembedding_filename; + std::string musicdiffusion_filename = inputs.musicdiffusion_filename; + std::string musicvae_filename = inputs.musicvae_filename; + printf("\nLoading Music Gen LLM Model: %s\nLoading Music Gen Embed Model: %s\nLoading Music Gen Diffusion Model: %s\nLoading Music Gen VAE Model: %s\n", + musicllm_filename.c_str(),musicembedding_filename.c_str(),musicdiffusion_filename.c_str(),musicvae_filename.c_str()); + musicdebugmode = inputs.debugmode; + + bool ok = load_acestep(musicllm_filename); + if (!ok) { + printf("\nFailed to load Music Gen Model!\n"); + return false; + } + + musicgen_loaded = true; + + printf("\nMusic Gen Load Complete.\n"); + return true; +} + +music_generation_outputs musictype_generate(const music_generation_inputs inputs) +{ + music_generation_outputs output; + + if(!musicgen_loaded) + { + printf("\nWarning: KCPP music gen not initialized!\n"); + output.status = 0; + return output; + } + + if(!music_is_quiet) + { + printf("\nMusic Gen Generating..."); + } + + output.status = 1; + return output; +}